Wine Quality Prediction

In [ ]:
__author__ = "Donald Ghazi"
__email__ = "donald@donaldghazi.com"
__website__ = "donaldghazi.com"
In [1]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
matplotlib.style.use('ggplot')
from sklearn import linear_model, metrics
import warnings
warnings.filterwarnings('ignore')
In [2]:
df = pd.read_csv("data.csv")
df = df.iloc[:, 1:14]   # drop the index column, keep the 13 data columns
df
Out[2]:
0 1 2 3 4 5 6 7 8 9 10 11 12
0 5.8 0.555 0.26 4.50 0.053 17.0 126.0 0.99430 3.24 0.46 9.1 0.5 0.0
1 6.3 0.360 0.19 3.20 0.075 15.0 39.0 0.99560 3.56 0.52 12.7 0.6 1.0
2 6.4 0.210 0.50 11.60 0.042 45.0 153.0 0.99720 3.15 0.43 8.8 0.5 0.0
3 6.6 0.330 0.24 16.05 0.045 31.0 147.0 0.99822 3.08 0.52 9.2 0.5 0.0
4 6.4 0.230 0.33 1.15 0.044 15.5 217.5 0.99200 3.33 0.44 11.0 0.6 0.0
5 7.2 0.510 0.24 10.00 0.093 35.0 197.0 0.99810 3.41 0.47 9.0 0.5 0.0
6 7.1 0.140 0.35 1.40 0.039 24.0 128.0 0.99212 2.97 0.68 10.4 0.5 0.0
7 7.3 0.260 0.53 12.70 0.047 60.5 156.0 0.99840 3.06 0.45 9.1 0.6 0.0
8 5.8 0.540 0.00 1.40 0.033 40.0 107.0 0.98918 3.26 0.35 12.4 0.5 0.0
9 8.7 0.310 0.73 14.35 0.044 27.0 191.0 1.00013 2.96 0.88 8.7 0.5 0.0
10 8.3 0.330 0.42 1.15 0.033 18.0 96.0 0.99110 3.20 0.32 12.4 0.3 0.0
11 8.5 0.280 0.56 1.80 0.092 35.0 103.0 0.99690 3.30 0.75 10.5 0.7 1.0
12 6.9 0.260 0.27 4.20 0.031 20.0 80.0 0.99089 3.12 0.39 11.5 0.6 0.0
13 6.2 0.255 0.27 1.30 0.037 30.0 86.0 0.98834 3.05 0.59 12.9 0.7 0.0
14 7.4 0.635 0.10 2.40 0.080 16.0 33.0 0.99736 3.58 0.69 10.8 0.7 1.0
15 5.7 0.250 0.27 10.80 0.050 58.0 116.0 0.99592 3.10 0.50 9.8 0.6 0.0
16 6.5 0.260 0.31 3.60 0.030 36.0 92.0 0.99026 3.22 0.62 12.6 0.8 0.0
17 6.8 0.320 0.21 2.20 0.044 15.0 68.0 0.99320 3.17 0.39 9.4 0.6 0.0
18 7.5 0.150 0.38 1.80 0.054 19.0 101.0 0.99460 3.24 0.44 10.0 0.5 0.0
19 6.7 0.540 0.27 7.10 0.049 8.0 178.0 0.99502 3.16 0.38 9.4 0.4 0.0
20 6.5 0.115 0.29 1.95 0.038 73.0 166.0 0.98900 3.12 0.25 12.9 0.7 0.0
21 6.1 0.560 0.00 2.20 0.079 6.0 9.0 0.99480 3.59 0.54 11.5 0.6 1.0
22 6.6 0.290 0.29 1.80 0.036 38.0 102.0 0.98819 3.08 0.42 13.7 0.7 0.0
23 7.9 0.190 0.42 1.60 0.057 18.0 30.0 0.99400 3.29 0.69 11.2 0.6 1.0
24 9.5 0.885 0.27 2.30 0.084 31.0 145.0 0.99780 3.24 0.53 9.4 0.5 1.0
25 6.4 0.500 0.20 2.40 0.059 19.0 112.0 0.99314 3.18 0.40 9.2 0.6 0.0
26 7.4 0.630 0.07 2.40 0.090 11.0 37.0 0.99790 3.43 0.76 9.7 0.6 1.0
27 7.6 0.190 0.37 13.10 0.033 52.0 151.0 0.99726 3.18 0.79 10.4 0.6 0.0
28 6.2 0.330 0.14 4.80 0.052 27.0 128.0 0.99475 3.21 0.48 9.4 0.5 0.0
29 7.0 0.170 0.33 4.00 0.034 17.0 127.0 0.99340 3.19 0.39 10.6 0.7 0.0
... ... ... ... ... ... ... ... ... ... ... ... ... ...
5970 6.9 0.400 0.17 12.90 0.033 59.0 186.0 0.99754 3.08 0.49 9.4 0.5 0.0
5971 6.7 0.230 0.33 8.10 0.048 45.0 176.0 0.99472 3.11 0.52 10.1 0.6 0.0
5972 7.9 0.260 0.41 15.15 0.040 38.0 216.0 0.99760 2.96 0.60 10.0 0.6 0.0
5973 10.8 0.500 0.46 2.50 0.073 5.0 27.0 1.00010 3.05 0.64 9.5 0.5 1.0
5974 7.9 0.255 0.26 2.00 0.026 40.0 190.0 0.99320 3.04 0.39 11.2 0.6 0.0
5975 7.0 0.190 0.23 5.70 0.123 27.0 104.0 0.99540 3.04 0.54 9.4 0.6 0.0
5976 6.4 0.280 0.29 1.60 0.052 34.0 127.0 0.99290 3.48 0.56 10.5 0.7 0.0
5977 7.0 0.420 0.35 1.60 0.088 16.0 39.0 0.99610 3.34 0.55 9.2 0.5 1.0
5978 7.4 0.550 0.22 2.20 0.106 12.0 72.0 0.99590 3.05 0.63 9.2 0.5 1.0
5979 7.2 0.240 0.29 3.00 0.036 17.0 117.0 0.99411 3.36 0.68 10.1 0.6 0.0
5980 6.8 0.310 0.30 8.00 0.028 33.0 122.0 0.99164 3.13 0.63 12.6 0.7 0.0
5981 6.7 0.210 0.34 1.40 0.049 36.0 112.0 0.99091 3.02 0.50 11.0 0.6 0.0
5982 7.1 0.130 0.29 15.50 0.064 56.0 115.5 0.99737 3.16 0.41 9.7 0.7 0.0
5983 8.1 0.240 0.26 11.00 0.043 41.0 211.0 0.99676 3.11 0.49 10.0 0.6 0.0
5984 7.4 0.160 0.49 1.20 0.055 18.0 150.0 0.99170 3.23 0.47 11.2 0.6 0.0
5985 6.1 0.340 0.25 1.80 0.084 4.0 28.0 0.99464 3.36 0.44 10.1 0.5 1.0
5986 7.5 0.330 0.36 2.60 0.051 26.0 126.0 0.99097 3.32 0.53 12.7 0.6 0.0
5987 6.8 0.210 0.27 18.15 0.042 41.0 146.0 1.00010 3.30 0.36 8.7 0.5 0.0
5988 5.1 0.140 0.25 0.70 0.039 15.0 89.0 0.99190 3.22 0.43 9.2 0.6 0.0
5989 8.2 0.250 0.46 3.75 0.050 14.0 102.0 0.99524 3.28 0.58 9.7 0.5 0.0
5990 7.8 0.820 0.29 4.30 0.083 21.0 64.0 0.99642 3.16 0.53 9.4 0.5 1.0
5991 6.1 0.380 0.14 3.90 0.060 27.0 113.0 0.99344 3.07 0.34 9.2 0.4 0.0
5992 7.4 0.310 0.28 1.60 0.050 33.0 137.0 0.99290 3.31 0.56 10.5 0.6 0.0
5993 7.2 0.360 0.36 5.70 0.038 26.0 98.0 0.99140 2.93 0.59 12.5 0.7 0.0
5994 6.9 0.630 0.01 2.40 0.076 14.0 39.0 0.99522 3.34 0.53 10.8 0.6 1.0
5995 8.8 0.480 0.41 3.30 0.092 26.0 52.0 0.99820 3.31 0.53 10.5 0.6 1.0
5996 6.3 0.390 0.08 1.70 0.066 3.0 20.0 0.99540 3.34 0.58 9.4 0.5 1.0
5997 7.3 0.200 0.37 1.20 0.037 48.0 119.0 0.99200 3.32 0.49 10.9 0.6 0.0
5998 8.5 0.250 0.27 4.70 0.031 31.0 92.0 0.99220 3.01 0.33 12.0 0.6 0.0
5999 6.6 0.360 0.21 1.50 0.049 39.0 184.0 0.99280 3.18 0.41 9.9 0.6 0.0

6000 rows × 13 columns

In [3]:
plt.scatter(df['12'],df['0'])
plt.show()
In [4]:
df.groupby('12')['0'].plot(kind='density')
plt.legend(labels=['Red','White'])
plt.show()
In [5]:
df.groupby('12')['1'].plot(kind='density')
plt.legend(labels=['Red','White'])
plt.show()
In [6]:
df.groupby('12')['2'].plot(kind='density')
plt.legend(labels=['Red','White'])
plt.show()
In [7]:
df.groupby('12')['4'].plot(kind='density')
plt.legend(labels=['Red','White'])
plt.show()
In [8]:
df.groupby('12')['5'].plot(kind='density')
plt.legend(labels=['Red','White'])
plt.show()
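The five cells above repeat the same plotting pattern once per feature; a loop over the columns of interest produces the same class-conditional density plots more compactly (a sketch, assuming the same columns '0' through '5' as above):

# Sketch: the same class-conditional density plots, drawn in a loop
for col in ['0', '1', '2', '4', '5']:
    df.groupby('12')[col].plot(kind='density')
    plt.legend(labels=['Red', 'White'])
    plt.title('Feature ' + col)
    plt.show()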
In [9]:
labels = df.iloc[:, 12:13]
In [10]:
features = df.iloc[:, 0:12]

training_size = int(len(df) * 0.8)   # 80/20 train/validation split -> 4800 training rows
training_features = features.iloc[:training_size]
validation_features = features.iloc[training_size:]
training_labels = labels.iloc[:training_size]
validation_labels = labels.iloc[training_size:]
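An equivalent split can also be obtained with scikit-learn's train_test_split; a sketch, assuming a shuffled split is acceptable here (the slicing above keeps the original row order instead):

# Sketch: an alternative 80/20 split using scikit-learn (shuffles rows by default)
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(
    features, labels, test_size=0.2, random_state=0)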
In [11]:
def modelTraining(training_features, training_labels, validation_features, validation_labels, n, regression_type):
    # Fit a logistic regression on the first n training samples and
    # return its accuracy on the validation set.
    x = training_features[:n]
    y = training_labels[:n].values.ravel()
    # liblinear supports both the 'l1' and 'l2' penalties
    lrm = linear_model.LogisticRegression(penalty=regression_type, solver='liblinear')
    lrm.fit(x, y)
    
    pred = lrm.predict(validation_features)
    score = metrics.accuracy_score(validation_labels, pred)
    
    return score
In [12]:
sample_points = [100, 200, 500, 1000, 2000, 4800]

L1Scores = []

for x in sample_points:
    L1Scores.append(modelTraining(training_features, 
                                  training_labels, 
                                  validation_features, 
                                  validation_labels,
                                  x,
                                  "l1"))
    
L1Scores
Out[12]:
[0.96499999999999997,
 0.97416666666666663,
 0.97583333333333333,
 0.98083333333333333,
 0.98333333333333328,
 0.98750000000000004]
In [13]:
plt.plot(sample_points, L1Scores, 'ro')
plt.xlabel('n training samples')
plt.ylabel('validation accuracy')
plt.show()
In [14]:
L2Scores = []

for x in sample_points:
    L2Scores.append(modelTraining(training_features, 
                                  training_labels, 
                                  validation_features, 
                                  validation_labels,
                                  x,
                                  "l2"))
    
L2Scores
Out[14]:
[0.96416666666666662,
 0.96750000000000003,
 0.96999999999999997,
 0.97666666666666668,
 0.98166666666666669,
 0.98499999999999999]
In [15]:
plt.plot(sample_points, L2Scores, 'ro')
plt.xlabel('n training samples')
plt.ylabel('validation accuracy')
plt.show()

L1 regularization performed better than L2: at every training-set size tried, the L1 model's validation accuracy in the first plot is higher than the L2 model's in the second.

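Plotting both learning curves on the same axes makes this comparison direct (a sketch reusing the sample_points, L1Scores, and L2Scores computed above):

# Sketch: L1 vs. L2 validation accuracy on shared axes
plt.plot(sample_points, L1Scores, 'ro-', label='L1')
plt.plot(sample_points, L2Scores, 'bo-', label='L2')
plt.xlabel('n training samples')
plt.ylabel('validation accuracy')
plt.legend()
plt.show()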
In [16]:
test = pd.read_csv("./test.csv")
test = test.iloc[:, 1:14]
test_features = test.iloc[:, 0:12]

# Retrain on the full training data with the better-performing L1 penalty
lrm2 = linear_model.LogisticRegression(penalty='l1', solver='liblinear')
lrm2.fit(features, labels.values.ravel())
results = lrm2.predict(test_features)

np.savetxt("./submission.csv", results, delimiter=',')
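np.savetxt writes the predictions in scientific notation by default; if the submission format expects plain integer class labels (an assumption, since the required format is not shown here), casting and a format string give cleaner output:

# Sketch: write integer class labels instead of floats (assumes integer labels are expected)
np.savetxt("./submission.csv", results.astype(int), fmt='%d', delimiter=',')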