Wine Quality Prediction
__author__ = "Donald Ghazi"
__email__ = "donald@donaldghazi.com"
__website__ = "donaldghazi.com"
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
matplotlib.style.use('ggplot')
from sklearn import linear_model, metrics
import warnings
warnings.filterwarnings('ignore')
df = pd.read_csv("data.csv")
# Drop the leading index column; keep the 12 feature columns plus the label column (names '0'..'12')
df = df.iloc[:, 1:14]
df
index | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 5.8 | 0.555 | 0.26 | 4.50 | 0.053 | 17.0 | 126.0 | 0.99430 | 3.24 | 0.46 | 9.1 | 0.5 | 0.0 |
1 | 6.3 | 0.360 | 0.19 | 3.20 | 0.075 | 15.0 | 39.0 | 0.99560 | 3.56 | 0.52 | 12.7 | 0.6 | 1.0 |
2 | 6.4 | 0.210 | 0.50 | 11.60 | 0.042 | 45.0 | 153.0 | 0.99720 | 3.15 | 0.43 | 8.8 | 0.5 | 0.0 |
3 | 6.6 | 0.330 | 0.24 | 16.05 | 0.045 | 31.0 | 147.0 | 0.99822 | 3.08 | 0.52 | 9.2 | 0.5 | 0.0 |
4 | 6.4 | 0.230 | 0.33 | 1.15 | 0.044 | 15.5 | 217.5 | 0.99200 | 3.33 | 0.44 | 11.0 | 0.6 | 0.0 |
5 | 7.2 | 0.510 | 0.24 | 10.00 | 0.093 | 35.0 | 197.0 | 0.99810 | 3.41 | 0.47 | 9.0 | 0.5 | 0.0 |
6 | 7.1 | 0.140 | 0.35 | 1.40 | 0.039 | 24.0 | 128.0 | 0.99212 | 2.97 | 0.68 | 10.4 | 0.5 | 0.0 |
7 | 7.3 | 0.260 | 0.53 | 12.70 | 0.047 | 60.5 | 156.0 | 0.99840 | 3.06 | 0.45 | 9.1 | 0.6 | 0.0 |
8 | 5.8 | 0.540 | 0.00 | 1.40 | 0.033 | 40.0 | 107.0 | 0.98918 | 3.26 | 0.35 | 12.4 | 0.5 | 0.0 |
9 | 8.7 | 0.310 | 0.73 | 14.35 | 0.044 | 27.0 | 191.0 | 1.00013 | 2.96 | 0.88 | 8.7 | 0.5 | 0.0 |
10 | 8.3 | 0.330 | 0.42 | 1.15 | 0.033 | 18.0 | 96.0 | 0.99110 | 3.20 | 0.32 | 12.4 | 0.3 | 0.0 |
11 | 8.5 | 0.280 | 0.56 | 1.80 | 0.092 | 35.0 | 103.0 | 0.99690 | 3.30 | 0.75 | 10.5 | 0.7 | 1.0 |
12 | 6.9 | 0.260 | 0.27 | 4.20 | 0.031 | 20.0 | 80.0 | 0.99089 | 3.12 | 0.39 | 11.5 | 0.6 | 0.0 |
13 | 6.2 | 0.255 | 0.27 | 1.30 | 0.037 | 30.0 | 86.0 | 0.98834 | 3.05 | 0.59 | 12.9 | 0.7 | 0.0 |
14 | 7.4 | 0.635 | 0.10 | 2.40 | 0.080 | 16.0 | 33.0 | 0.99736 | 3.58 | 0.69 | 10.8 | 0.7 | 1.0 |
15 | 5.7 | 0.250 | 0.27 | 10.80 | 0.050 | 58.0 | 116.0 | 0.99592 | 3.10 | 0.50 | 9.8 | 0.6 | 0.0 |
16 | 6.5 | 0.260 | 0.31 | 3.60 | 0.030 | 36.0 | 92.0 | 0.99026 | 3.22 | 0.62 | 12.6 | 0.8 | 0.0 |
17 | 6.8 | 0.320 | 0.21 | 2.20 | 0.044 | 15.0 | 68.0 | 0.99320 | 3.17 | 0.39 | 9.4 | 0.6 | 0.0 |
18 | 7.5 | 0.150 | 0.38 | 1.80 | 0.054 | 19.0 | 101.0 | 0.99460 | 3.24 | 0.44 | 10.0 | 0.5 | 0.0 |
19 | 6.7 | 0.540 | 0.27 | 7.10 | 0.049 | 8.0 | 178.0 | 0.99502 | 3.16 | 0.38 | 9.4 | 0.4 | 0.0 |
20 | 6.5 | 0.115 | 0.29 | 1.95 | 0.038 | 73.0 | 166.0 | 0.98900 | 3.12 | 0.25 | 12.9 | 0.7 | 0.0 |
21 | 6.1 | 0.560 | 0.00 | 2.20 | 0.079 | 6.0 | 9.0 | 0.99480 | 3.59 | 0.54 | 11.5 | 0.6 | 1.0 |
22 | 6.6 | 0.290 | 0.29 | 1.80 | 0.036 | 38.0 | 102.0 | 0.98819 | 3.08 | 0.42 | 13.7 | 0.7 | 0.0 |
23 | 7.9 | 0.190 | 0.42 | 1.60 | 0.057 | 18.0 | 30.0 | 0.99400 | 3.29 | 0.69 | 11.2 | 0.6 | 1.0 |
24 | 9.5 | 0.885 | 0.27 | 2.30 | 0.084 | 31.0 | 145.0 | 0.99780 | 3.24 | 0.53 | 9.4 | 0.5 | 1.0 |
25 | 6.4 | 0.500 | 0.20 | 2.40 | 0.059 | 19.0 | 112.0 | 0.99314 | 3.18 | 0.40 | 9.2 | 0.6 | 0.0 |
26 | 7.4 | 0.630 | 0.07 | 2.40 | 0.090 | 11.0 | 37.0 | 0.99790 | 3.43 | 0.76 | 9.7 | 0.6 | 1.0 |
27 | 7.6 | 0.190 | 0.37 | 13.10 | 0.033 | 52.0 | 151.0 | 0.99726 | 3.18 | 0.79 | 10.4 | 0.6 | 0.0 |
28 | 6.2 | 0.330 | 0.14 | 4.80 | 0.052 | 27.0 | 128.0 | 0.99475 | 3.21 | 0.48 | 9.4 | 0.5 | 0.0 |
29 | 7.0 | 0.170 | 0.33 | 4.00 | 0.034 | 17.0 | 127.0 | 0.99340 | 3.19 | 0.39 | 10.6 | 0.7 | 0.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
5970 | 6.9 | 0.400 | 0.17 | 12.90 | 0.033 | 59.0 | 186.0 | 0.99754 | 3.08 | 0.49 | 9.4 | 0.5 | 0.0 |
5971 | 6.7 | 0.230 | 0.33 | 8.10 | 0.048 | 45.0 | 176.0 | 0.99472 | 3.11 | 0.52 | 10.1 | 0.6 | 0.0 |
5972 | 7.9 | 0.260 | 0.41 | 15.15 | 0.040 | 38.0 | 216.0 | 0.99760 | 2.96 | 0.60 | 10.0 | 0.6 | 0.0 |
5973 | 10.8 | 0.500 | 0.46 | 2.50 | 0.073 | 5.0 | 27.0 | 1.00010 | 3.05 | 0.64 | 9.5 | 0.5 | 1.0 |
5974 | 7.9 | 0.255 | 0.26 | 2.00 | 0.026 | 40.0 | 190.0 | 0.99320 | 3.04 | 0.39 | 11.2 | 0.6 | 0.0 |
5975 | 7.0 | 0.190 | 0.23 | 5.70 | 0.123 | 27.0 | 104.0 | 0.99540 | 3.04 | 0.54 | 9.4 | 0.6 | 0.0 |
5976 | 6.4 | 0.280 | 0.29 | 1.60 | 0.052 | 34.0 | 127.0 | 0.99290 | 3.48 | 0.56 | 10.5 | 0.7 | 0.0 |
5977 | 7.0 | 0.420 | 0.35 | 1.60 | 0.088 | 16.0 | 39.0 | 0.99610 | 3.34 | 0.55 | 9.2 | 0.5 | 1.0 |
5978 | 7.4 | 0.550 | 0.22 | 2.20 | 0.106 | 12.0 | 72.0 | 0.99590 | 3.05 | 0.63 | 9.2 | 0.5 | 1.0 |
5979 | 7.2 | 0.240 | 0.29 | 3.00 | 0.036 | 17.0 | 117.0 | 0.99411 | 3.36 | 0.68 | 10.1 | 0.6 | 0.0 |
5980 | 6.8 | 0.310 | 0.30 | 8.00 | 0.028 | 33.0 | 122.0 | 0.99164 | 3.13 | 0.63 | 12.6 | 0.7 | 0.0 |
5981 | 6.7 | 0.210 | 0.34 | 1.40 | 0.049 | 36.0 | 112.0 | 0.99091 | 3.02 | 0.50 | 11.0 | 0.6 | 0.0 |
5982 | 7.1 | 0.130 | 0.29 | 15.50 | 0.064 | 56.0 | 115.5 | 0.99737 | 3.16 | 0.41 | 9.7 | 0.7 | 0.0 |
5983 | 8.1 | 0.240 | 0.26 | 11.00 | 0.043 | 41.0 | 211.0 | 0.99676 | 3.11 | 0.49 | 10.0 | 0.6 | 0.0 |
5984 | 7.4 | 0.160 | 0.49 | 1.20 | 0.055 | 18.0 | 150.0 | 0.99170 | 3.23 | 0.47 | 11.2 | 0.6 | 0.0 |
5985 | 6.1 | 0.340 | 0.25 | 1.80 | 0.084 | 4.0 | 28.0 | 0.99464 | 3.36 | 0.44 | 10.1 | 0.5 | 1.0 |
5986 | 7.5 | 0.330 | 0.36 | 2.60 | 0.051 | 26.0 | 126.0 | 0.99097 | 3.32 | 0.53 | 12.7 | 0.6 | 0.0 |
5987 | 6.8 | 0.210 | 0.27 | 18.15 | 0.042 | 41.0 | 146.0 | 1.00010 | 3.30 | 0.36 | 8.7 | 0.5 | 0.0 |
5988 | 5.1 | 0.140 | 0.25 | 0.70 | 0.039 | 15.0 | 89.0 | 0.99190 | 3.22 | 0.43 | 9.2 | 0.6 | 0.0 |
5989 | 8.2 | 0.250 | 0.46 | 3.75 | 0.050 | 14.0 | 102.0 | 0.99524 | 3.28 | 0.58 | 9.7 | 0.5 | 0.0 |
5990 | 7.8 | 0.820 | 0.29 | 4.30 | 0.083 | 21.0 | 64.0 | 0.99642 | 3.16 | 0.53 | 9.4 | 0.5 | 1.0 |
5991 | 6.1 | 0.380 | 0.14 | 3.90 | 0.060 | 27.0 | 113.0 | 0.99344 | 3.07 | 0.34 | 9.2 | 0.4 | 0.0 |
5992 | 7.4 | 0.310 | 0.28 | 1.60 | 0.050 | 33.0 | 137.0 | 0.99290 | 3.31 | 0.56 | 10.5 | 0.6 | 0.0 |
5993 | 7.2 | 0.360 | 0.36 | 5.70 | 0.038 | 26.0 | 98.0 | 0.99140 | 2.93 | 0.59 | 12.5 | 0.7 | 0.0 |
5994 | 6.9 | 0.630 | 0.01 | 2.40 | 0.076 | 14.0 | 39.0 | 0.99522 | 3.34 | 0.53 | 10.8 | 0.6 | 1.0 |
5995 | 8.8 | 0.480 | 0.41 | 3.30 | 0.092 | 26.0 | 52.0 | 0.99820 | 3.31 | 0.53 | 10.5 | 0.6 | 1.0 |
5996 | 6.3 | 0.390 | 0.08 | 1.70 | 0.066 | 3.0 | 20.0 | 0.99540 | 3.34 | 0.58 | 9.4 | 0.5 | 1.0 |
5997 | 7.3 | 0.200 | 0.37 | 1.20 | 0.037 | 48.0 | 119.0 | 0.99200 | 3.32 | 0.49 | 10.9 | 0.6 | 0.0 |
5998 | 8.5 | 0.250 | 0.27 | 4.70 | 0.031 | 31.0 | 92.0 | 0.99220 | 3.01 | 0.33 | 12.0 | 0.6 | 0.0 |
5999 | 6.6 | 0.360 | 0.21 | 1.50 | 0.049 | 39.0 | 184.0 | 0.99280 | 3.18 | 0.41 | 9.9 | 0.6 | 0.0 |
6000 rows × 13 columns
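Column '12' is the label used for the red/white split in the plots and models below, so it is worth checking its class balance first. A minimal sketch (column names are the string indices shown in the table header):

# Class balance of the label column '12' (values are 0.0 / 1.0)
print(df['12'].value_counts())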
plt.scatter(df['12'],df['0'])
plt.show()
df.groupby('12')['0'].plot(kind='density')
plt.legend(labels=['Red','White'])
plt.show()
df.groupby('12')['1'].plot(kind='density')
plt.legend(labels=['Red','White'])
plt.show()
df.groupby('12')['2'].plot(kind='density')
plt.legend(labels=['Red','White'])
plt.show()
df.groupby('12')['4'].plot(kind='density')
plt.legend(labels=['Red','White'])
plt.show()
df.groupby('12')['5'].plot(kind='density')
plt.legend(labels=['Red','White'])
plt.show()
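The five density-plot cells above repeat the same pattern; they could equivalently be produced in one loop, sketched here with the same string column names and legend as the original cells:

# One density plot per selected feature, split by the label column '12'
for col in ['0', '1', '2', '4', '5']:
    df.groupby('12')[col].plot(kind='density')
    plt.legend(labels=['Red', 'White'])
    plt.show()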
labels = df.iloc[:, 12:13]
features = df.iloc[:, 0:12]
training_proportion = int(len(df) * 0.8)  # 4800 of the 6000 rows used for training
training_features = features.iloc[:training_proportion]
validation_features = features.iloc[training_proportion:]
training_labels = labels.iloc[:training_proportion]
validation_labels = labels.iloc[training_proportion:]
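The split above simply takes the first 80% of rows for training. An alternative sketch using scikit-learn's train_test_split (shuffled and stratified on the label) is shown below; the X_train/X_val/y_train/y_val names are illustrative only, and the rest of the notebook continues to use the manual split:

from sklearn.model_selection import train_test_split

# Shuffled, stratified 80/20 split; random_state makes the shuffle reproducible
X_train, X_val, y_train, y_val = train_test_split(
    features, labels, test_size=0.2, stratify=labels, random_state=0)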
def modelTraining(training_features, training_labels, validation_features, validation_labels, n, regression_type):
    # Fit a logistic regression on the first n training rows and score it on the validation set
    x = training_features[:n]
    y = training_labels[:n].values.ravel()
    # liblinear supports both the 'l1' and 'l2' penalties used below
    lrm = linear_model.LogisticRegression(penalty=regression_type, solver='liblinear')
    lrm.fit(x, y)
    pred = lrm.predict(validation_features)
    score = metrics.accuracy_score(validation_labels, pred)
    return score
sample_points = [100, 200, 500, 1000, 2000, 4800]
L1Scores = []
for x in sample_points:
    L1Scores.append(modelTraining(training_features,
                                  training_labels,
                                  validation_features,
                                  validation_labels,
                                  x,
                                  "l1"))
L1Scores
[0.96499999999999997, 0.97416666666666663, 0.97583333333333333, 0.98083333333333333, 0.98333333333333328, 0.98750000000000004]
plt.plot(sample_points, L1Scores, 'ro')
plt.xlabel('n training samples')
plt.ylabel('validation accuracy')
plt.show()
L2Scores = []
for x in sample_points:
    L2Scores.append(modelTraining(training_features,
                                  training_labels,
                                  validation_features,
                                  validation_labels,
                                  x,
                                  "l2"))
L2Scores
[0.96416666666666662, 0.96750000000000003, 0.96999999999999997, 0.97666666666666668, 0.98166666666666669, 0.98499999999999999]
plt.plot(sample_points, L2Scores, 'ro')
plt.xlabel('n training samples')
plt.ylabel('validation accuracy')
plt.show()
L1 regularization performed better than L2: at every training-set size, the L1 model achieved a higher validation accuracy.
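To make the comparison explicit, the two learning curves can be overlaid in a single plot, reusing the lists computed above:

# Overlay both learning curves for a direct L1 vs. L2 comparison
plt.plot(sample_points, L1Scores, 'ro-', label='L1 penalty')
plt.plot(sample_points, L2Scores, 'bo-', label='L2 penalty')
plt.xlabel('n training samples')
plt.ylabel('validation accuracy')
plt.legend()
plt.show()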
test = pd.read_csv("./test.csv")
test = test.iloc[:, 1:14]
test_features = test.iloc[:, 0:12]
lrm2 = linear_model.LogisticRegression(penalty='l1', solver='liblinear')
lrm2.fit(features, labels.values.ravel())
results = lrm2.predict(test_features)
# Write the predicted labels as integers rather than the default scientific notation
np.savetxt("./submission.csv", results, delimiter=',', fmt='%d')
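As an optional sanity check before submitting, the same L1 model can be refit on the training split only and inspected against the held-out validation labels; the check variable below is introduced just for this sketch:

# Refit on the training split only, then look at the validation confusion matrix
check = linear_model.LogisticRegression(penalty='l1', solver='liblinear')
check.fit(training_features, training_labels.values.ravel())
val_pred = check.predict(validation_features)
print(metrics.confusion_matrix(validation_labels, val_pred))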