import pandas as pd

# Accuracy of each fitted model, keyed by a short label ("LR", "k1", ...).
models_performance = {}

# The file is whitespace-separated with no header row, so pandas labels the
# columns by position (0, 1, 2, ...).
# sep=r"\s+" is the supported spelling of the deprecated delim_whitespace=True.
df = pd.read_csv("zip.train", sep=r"\s+", header=None)
# Column 0 is used as the target; keep only the rows labelled 2 or 3.
df = df[df[0].isin([2, 3])]

df  # notebook-style echo; has no effect when run as a script


# Y is deliberately kept as a one-column DataFrame (not a Series).
Y = df[[0]]
# Every remaining column is a feature.
X = df.drop(columns=0)


# Held-out data, read and filtered the same way as the training file:
# whitespace-separated, no header row, only rows whose column 0 is 2 or 3.
# sep=r"\s+" is the supported spelling of the deprecated delim_whitespace=True.
test_df = pd.read_csv("zip.test", sep=r"\s+", header=None)
test_df = test_df[test_df[0].isin([2, 3])]
# Target stays a one-column DataFrame; all other columns are features.
test_Y = test_df[[0]]
test_X = test_df.drop(columns=0)


import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score

# Least-squares regression of the label column (Y) on the feature columns (X).
# fit() hands back the estimator itself, so construction and training can be
# chained into a single assignment.
regr = linear_model.LinearRegression().fit(X, Y)

# Output: LinearRegression()


# Score the regression as a two-class classifier on the held-out set.
predict_Y = regr.predict(test_X)
# The regressor's output is continuous and unbounded. Clamp it to the label
# range [2, 3] before rounding; otherwise a prediction such as 3.6 rounds to 4,
# a label that does not occur in the filtered data, and is scored as wrong
# even though it lies on the "3" side of the 2.5 decision boundary.
predicted_labels = predict_Y.clip(2, 3).round()
acc_lr = accuracy_score(test_Y, predicted_labels)
models_performance["LR"] = acc_lr
print(f'Accuracy score linear regression: {acc_lr:.2f}')

# Output: Accuracy score linear regression: 0.96


from sklearn.neighbors import KNeighborsClassifier

# Neighbourhood sizes to compare.
k = [1, 3, 5, 7, 15]

# Y is a one-column DataFrame; flatten it to a 1-D array once, outside the
# loop, instead of recomputing the same array for every neighbourhood size.
train_labels = Y.values.ravel()

for i in k:
    neigh = KNeighborsClassifier(n_neighbors=i)
    neigh.fit(X, train_labels)
    # The classifier returns class labels directly, so no rounding is needed
    # before scoring.
    kn_predict = neigh.predict(test_X)
    acc_kn = accuracy_score(test_Y, kn_predict)
    models_performance[f"k{i}"] = acc_kn

    print(f'Accuracy score [k={i}]: {acc_kn:.2f}')

# Output:
# Accuracy score [k=1]: 0.98
# Accuracy score [k=3]: 0.97
# Accuracy score [k=5]: 0.97
# Accuracy score [k=7]: 0.97
# Accuracy score [k=15]: 0.96


# Notebook-style echo of the collected accuracies; as a bare expression it has
# no effect when the file is run as a plain script.
models_performance

# Output:
# {'LR': 0.9587912087912088,
#  'k1': 0.9752747252747253,
#  'k3': 0.9697802197802198,
#  'k5': 0.9697802197802198,
#  'k7': 0.967032967032967,
#  'k15': 0.9615384615384616}


import matplotlib.pyplot as plt

# Vertical bar chart of accuracy per model.
# models_performance stores fractions (0-1) while the axis is labelled in
# percent, so scale into a local list for plotting. The dict itself is left
# untouched: scaling it in place would multiply the values again every time
# this cell is re-run.
model_names = list(models_performance)
accuracy_pct = [score * 100 for score in models_performance.values()]

plt.bar(model_names, accuracy_pct, align='center')

plt.title('Models Accuracy Comparison')
plt.xlabel('Models')
plt.ylabel('Accuracy %')


plt.show()


# Horizontal bar chart of accuracy per model, with each bar annotated by its
# value.
fig, ax = plt.subplots()
# Thickness of each horizontal bar (barh's third positional argument is the
# bar height, not its width).
bar_height = 0.75

# models_performance stores fractions (0-1); the axis label and the "+ 1"
# annotation offset below are both in percent units, so convert locally
# without mutating the dict.
model_names = list(models_performance)
accuracy_pct = [score * 100 for score in models_performance.values()]

ax.barh(model_names, accuracy_pct, height=bar_height)
ax.set_title('Models Accuracy Comparison')
ax.set_xlabel('Accuracy %')
ax.set_ylabel('Models')
# Leave room to the right of a ~100% bar for its text annotation.
ax.set_xlim(0, 110)

for row, pct in enumerate(accuracy_pct):
    ax.text(pct + 1, row, f'{pct:.1f}%', color='red', va='center')
plt.show()

# Notebook output (DataFrame display):
# 	0	1	2	3	4	5	6	7	8	9	...	247	248	249	250	251	252	253	254	255	256
# 4	3.0	-1.000	-1.000	-1.000	-1.000	-1.000	-0.928	-0.204	0.751	0.466	...	0.466	0.639	1.000	1.000	0.791	0.439	-0.199	-0.883	-1.0	-1.0
# 6	3.0	-1.000	-1.000	-1.000	-0.830	0.442	1.000	1.000	0.479	-0.328	...	1.000	0.671	0.345	-0.507	-1.000	-1.000	-1.000	-1.000	-1.0	-1.0
# 26	3.0	-1.000	-1.000	-1.000	-1.000	-1.000	-0.104	0.549	0.579	0.579	...	0.388	0.579	0.811	1.000	1.000	0.715	0.107	-0.526	-1.0	-1.0
# 30	3.0	-1.000	-1.000	-1.000	-1.000	-1.000	-1.000	-0.107	1.000	1.000	...	-0.280	0.322	0.813	1.000	1.000	0.633	-0.144	-0.994	-1.0	-1.0
# 35	3.0	-1.000	-1.000	-1.000	-1.000	-0.674	0.492	0.573	0.755	-0.018	...	0.537	1.000	1.000	0.689	-0.530	-1.000	-1.000	-1.000	-1.0	-1.0
# ...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
# 7282	3.0	-1.000	-0.882	-0.334	0.267	0.333	0.749	1.000	1.000	1.000	...	0.968	1.000	1.000	1.000	0.809	0.325	-0.820	-1.000	-1.0	-1.0
# 7283	3.0	-0.985	-0.048	0.226	0.226	0.226	-0.355	-0.807	-1.000	-0.726	...	-0.307	-0.555	-0.555	-0.555	-0.556	-1.000	-1.000	-1.000	-1.0	-1.0
# 7286	3.0	-1.000	-1.000	-1.000	-0.988	-0.527	-0.208	0.620	1.000	0.467	...	-0.116	0.899	0.416	-0.510	-1.000	-1.000	-1.000	-1.000	-1.0	-1.0
# 7287	3.0	-1.000	-1.000	-1.000	-0.990	0.708	0.557	0.347	-0.107	-0.758	...	0.697	0.636	0.167	-0.968	-1.000	-1.000	-1.000	-1.000	-1.0	-1.0
# 7288	3.0	-1.000	-1.000	-1.000	-0.783	-0.984	-0.827	0.068	1.000	1.000	...	0.805	1.000	1.000	0.727	-0.342	-0.933	-1.000	-1.000	-1.0	-1.0