Wine Quality Prediction

In [ ]:
__author__ = "Donald Ghazi"
__email__ = "donald@donaldghazi.com"
__website__ = "donaldghazi.com"
In [1]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
matplotlib.style.use('ggplot')
from sklearn import linear_model, metrics
import warnings
warnings.filterwarnings('ignore')
In [2]:
df = pd.read_csv("data.csv")
df = df.iloc[:, 1:14]   # drop the index column, keep the 13 data columns
df
Out[2]:
0 1 2 3 4 5 6 7 8 9 10 11 12
0 5.8 0.555 0.26 4.50 0.053 17.0 126.0 0.99430 3.24 0.46 9.1 0.5 0.0
1 6.3 0.360 0.19 3.20 0.075 15.0 39.0 0.99560 3.56 0.52 12.7 0.6 1.0
2 6.4 0.210 0.50 11.60 0.042 45.0 153.0 0.99720 3.15 0.43 8.8 0.5 0.0
3 6.6 0.330 0.24 16.05 0.045 31.0 147.0 0.99822 3.08 0.52 9.2 0.5 0.0
4 6.4 0.230 0.33 1.15 0.044 15.5 217.5 0.99200 3.33 0.44 11.0 0.6 0.0
5 7.2 0.510 0.24 10.00 0.093 35.0 197.0 0.99810 3.41 0.47 9.0 0.5 0.0
6 7.1 0.140 0.35 1.40 0.039 24.0 128.0 0.99212 2.97 0.68 10.4 0.5 0.0
7 7.3 0.260 0.53 12.70 0.047 60.5 156.0 0.99840 3.06 0.45 9.1 0.6 0.0
8 5.8 0.540 0.00 1.40 0.033 40.0 107.0 0.98918 3.26 0.35 12.4 0.5 0.0
9 8.7 0.310 0.73 14.35 0.044 27.0 191.0 1.00013 2.96 0.88 8.7 0.5 0.0
10 8.3 0.330 0.42 1.15 0.033 18.0 96.0 0.99110 3.20 0.32 12.4 0.3 0.0
11 8.5 0.280 0.56 1.80 0.092 35.0 103.0 0.99690 3.30 0.75 10.5 0.7 1.0
12 6.9 0.260 0.27 4.20 0.031 20.0 80.0 0.99089 3.12 0.39 11.5 0.6 0.0
13 6.2 0.255 0.27 1.30 0.037 30.0 86.0 0.98834 3.05 0.59 12.9 0.7 0.0
14 7.4 0.635 0.10 2.40 0.080 16.0 33.0 0.99736 3.58 0.69 10.8 0.7 1.0
15 5.7 0.250 0.27 10.80 0.050 58.0 116.0 0.99592 3.10 0.50 9.8 0.6 0.0
16 6.5 0.260 0.31 3.60 0.030 36.0 92.0 0.99026 3.22 0.62 12.6 0.8 0.0
17 6.8 0.320 0.21 2.20 0.044 15.0 68.0 0.99320 3.17 0.39 9.4 0.6 0.0
18 7.5 0.150 0.38 1.80 0.054 19.0 101.0 0.99460 3.24 0.44 10.0 0.5 0.0
19 6.7 0.540 0.27 7.10 0.049 8.0 178.0 0.99502 3.16 0.38 9.4 0.4 0.0
20 6.5 0.115 0.29 1.95 0.038 73.0 166.0 0.98900 3.12 0.25 12.9 0.7 0.0
21 6.1 0.560 0.00 2.20 0.079 6.0 9.0 0.99480 3.59 0.54 11.5 0.6 1.0
22 6.6 0.290 0.29 1.80 0.036 38.0 102.0 0.98819 3.08 0.42 13.7 0.7 0.0
23 7.9 0.190 0.42 1.60 0.057 18.0 30.0 0.99400 3.29 0.69 11.2 0.6 1.0
24 9.5 0.885 0.27 2.30 0.084 31.0 145.0 0.99780 3.24 0.53 9.4 0.5 1.0
25 6.4 0.500 0.20 2.40 0.059 19.0 112.0 0.99314 3.18 0.40 9.2 0.6 0.0
26 7.4 0.630 0.07 2.40 0.090 11.0 37.0 0.99790 3.43 0.76 9.7 0.6 1.0
27 7.6 0.190 0.37 13.10 0.033 52.0 151.0 0.99726 3.18 0.79 10.4 0.6 0.0
28 6.2 0.330 0.14 4.80 0.052 27.0 128.0 0.99475 3.21 0.48 9.4 0.5 0.0
29 7.0 0.170 0.33 4.00 0.034 17.0 127.0 0.99340 3.19 0.39 10.6 0.7 0.0
... ... ... ... ... ... ... ... ... ... ... ... ... ...
5970 6.9 0.400 0.17 12.90 0.033 59.0 186.0 0.99754 3.08 0.49 9.4 0.5 0.0
5971 6.7 0.230 0.33 8.10 0.048 45.0 176.0 0.99472 3.11 0.52 10.1 0.6 0.0
5972 7.9 0.260 0.41 15.15 0.040 38.0 216.0 0.99760 2.96 0.60 10.0 0.6 0.0
5973 10.8 0.500 0.46 2.50 0.073 5.0 27.0 1.00010 3.05 0.64 9.5 0.5 1.0
5974 7.9 0.255 0.26 2.00 0.026 40.0 190.0 0.99320 3.04 0.39 11.2 0.6 0.0
5975 7.0 0.190 0.23 5.70 0.123 27.0 104.0 0.99540 3.04 0.54 9.4 0.6 0.0
5976 6.4 0.280 0.29 1.60 0.052 34.0 127.0 0.99290 3.48 0.56 10.5 0.7 0.0
5977 7.0 0.420 0.35 1.60 0.088 16.0 39.0 0.99610 3.34 0.55 9.2 0.5 1.0
5978 7.4 0.550 0.22 2.20 0.106 12.0 72.0 0.99590 3.05 0.63 9.2 0.5 1.0
5979 7.2 0.240 0.29 3.00 0.036 17.0 117.0 0.99411 3.36 0.68 10.1 0.6 0.0
5980 6.8 0.310 0.30 8.00 0.028 33.0 122.0 0.99164 3.13 0.63 12.6 0.7 0.0
5981 6.7 0.210 0.34 1.40 0.049 36.0 112.0 0.99091 3.02 0.50 11.0 0.6 0.0
5982 7.1 0.130 0.29 15.50 0.064 56.0 115.5 0.99737 3.16 0.41 9.7 0.7 0.0
5983 8.1 0.240 0.26 11.00 0.043 41.0 211.0 0.99676 3.11 0.49 10.0 0.6 0.0
5984 7.4 0.160 0.49 1.20 0.055 18.0 150.0 0.99170 3.23 0.47 11.2 0.6 0.0
5985 6.1 0.340 0.25 1.80 0.084 4.0 28.0 0.99464 3.36 0.44 10.1 0.5 1.0
5986 7.5 0.330 0.36 2.60 0.051 26.0 126.0 0.99097 3.32 0.53 12.7 0.6 0.0
5987 6.8 0.210 0.27 18.15 0.042 41.0 146.0 1.00010 3.30 0.36 8.7 0.5 0.0
5988 5.1 0.140 0.25 0.70 0.039 15.0 89.0 0.99190 3.22 0.43 9.2 0.6 0.0
5989 8.2 0.250 0.46 3.75 0.050 14.0 102.0 0.99524 3.28 0.58 9.7 0.5 0.0
5990 7.8 0.820 0.29 4.30 0.083 21.0 64.0 0.99642 3.16 0.53 9.4 0.5 1.0
5991 6.1 0.380 0.14 3.90 0.060 27.0 113.0 0.99344 3.07 0.34 9.2 0.4 0.0
5992 7.4 0.310 0.28 1.60 0.050 33.0 137.0 0.99290 3.31 0.56 10.5 0.6 0.0
5993 7.2 0.360 0.36 5.70 0.038 26.0 98.0 0.99140 2.93 0.59 12.5 0.7 0.0
5994 6.9 0.630 0.01 2.40 0.076 14.0 39.0 0.99522 3.34 0.53 10.8 0.6 1.0
5995 8.8 0.480 0.41 3.30 0.092 26.0 52.0 0.99820 3.31 0.53 10.5 0.6 1.0
5996 6.3 0.390 0.08 1.70 0.066 3.0 20.0 0.99540 3.34 0.58 9.4 0.5 1.0
5997 7.3 0.200 0.37 1.20 0.037 48.0 119.0 0.99200 3.32 0.49 10.9 0.6 0.0
5998 8.5 0.250 0.27 4.70 0.031 31.0 92.0 0.99220 3.01 0.33 12.0 0.6 0.0
5999 6.6 0.360 0.21 1.50 0.049 39.0 184.0 0.99280 3.18 0.41 9.9 0.6 0.0

6000 rows × 13 columns

In [3]:
plt.scatter(df['12'],df['0'])
plt.show()
In [4]:
df.groupby('12')['0'].plot(kind='density')
plt.legend(labels=['Red','White'])
plt.show()
In [5]:
df.groupby('12')['1'].plot(kind='density')
plt.legend(labels=['Red','White'])
plt.show()
In [6]:
df.groupby('12')['2'].plot(kind='density')
plt.legend(labels=['Red','White'])
plt.show()
In [7]:
df.groupby('12')['4'].plot(kind='density')
plt.legend(labels=['Red','White'])
plt.show()
In [8]:
df.groupby('12')['5'].plot(kind='density')
plt.legend(labels=['Red','White'])
plt.show()
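The five cells above repeat the same plotting pattern once per feature; a loop over the columns of interest produces the same class-conditional density plots more compactly (a sketch, assuming the same columns '0' through '5' as above):

# Sketch: the same class-conditional density plots, drawn in a loop
for col in ['0', '1', '2', '4', '5']:
    df.groupby('12')[col].plot(kind='density')
    plt.legend(labels=['Red', 'White'])
    plt.title('Feature ' + col)
    plt.show()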
In [9]:
labels = df.iloc[:, 12:13]
In [10]:
features = df.iloc[:, 0:12]

training_size = int(len(df) * 0.8)   # 80/20 train/validation split -> 4800 training rows
training_features = features.iloc[:training_size]
validation_features = features.iloc[training_size:]
training_labels = labels.iloc[:training_size]
validation_labels = labels.iloc[training_size:]
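An equivalent split can also be obtained with scikit-learn's train_test_split; a sketch, assuming a shuffled split is acceptable here (the slicing above keeps the original row order instead):

# Sketch: an alternative 80/20 split using scikit-learn (shuffles rows by default)
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(
    features, labels, test_size=0.2, random_state=0)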
In [11]:
def modelTraining(training_features, training_labels, validation_features, validation_labels, n, regression_type):
    # Fit a logistic regression on the first n training samples and
    # return its accuracy on the validation set.
    x = training_features[:n]
    y = training_labels[:n].values.ravel()
    # liblinear supports both the 'l1' and 'l2' penalties
    lrm = linear_model.LogisticRegression(penalty=regression_type, solver='liblinear')
    lrm.fit(x, y)
    
    pred = lrm.predict(validation_features)
    score = metrics.accuracy_score(validation_labels, pred)
    
    return score
In [12]:
sample_points = [100, 200, 500, 1000, 2000, 4800]

L1Scores = []

for x in sample_points:
    L1Scores.append(modelTraining(training_features, 
                                  training_labels, 
                                  validation_features, 
                                  validation_labels,
                                  x,
                                  "l1"))
    
L1Scores
Out[12]:
[0.96499999999999997,
 0.97416666666666663,
 0.97583333333333333,
 0.98083333333333333,
 0.98333333333333328,
 0.98750000000000004]
In [13]:
plt.plot(sample_points, L1Scores, 'ro')
plt.xlabel('n training samples')
plt.ylabel('validation accuracy')
plt.show()
In [14]:
L2Scores = []

for x in sample_points:
    L2Scores.append(modelTraining(training_features, 
                                  training_labels, 
                                  validation_features, 
                                  validation_labels,
                                  x,
                                  "l2"))
    
L2Scores
Out[14]:
[0.96416666666666662,
 0.96750000000000003,
 0.96999999999999997,
 0.97666666666666668,
 0.98166666666666669,
 0.98499999999999999]
In [15]:
plt.plot(sample_points, L2Scores, 'ro')
plt.xlabel('n training samples')
plt.ylabel('validation accuracy')
plt.show()

L1 regularization performed better than L2: at every training-set size tried, the L1 model's validation accuracy in the first plot is higher than the L2 model's in the second.

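Plotting both learning curves on the same axes makes this comparison direct (a sketch reusing the sample_points, L1Scores, and L2Scores computed above):

# Sketch: L1 vs. L2 validation accuracy on shared axes
plt.plot(sample_points, L1Scores, 'ro-', label='L1')
plt.plot(sample_points, L2Scores, 'bo-', label='L2')
plt.xlabel('n training samples')
plt.ylabel('validation accuracy')
plt.legend()
plt.show()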
In [16]:
test = pd.read_csv("./test.csv")
test = test.iloc[:, 1:14]
test_features = test.iloc[:, 0:12]

# Retrain on the full training data with the better-performing L1 penalty
lrm2 = linear_model.LogisticRegression(penalty='l1', solver='liblinear')
lrm2.fit(features, labels.values.ravel())
results = lrm2.predict(test_features)

np.savetxt("./submission.csv", results, delimiter=',')
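np.savetxt writes the predictions in scientific notation by default; if the submission format expects plain integer class labels (an assumption, since the required format is not shown here), casting and a format string give cleaner output:

# Sketch: write integer class labels instead of floats (assumes integer labels are expected)
np.savetxt("./submission.csv", results.astype(int), fmt='%d', delimiter=',')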