# Collects each model's test-set accuracy under a short label ("LR", "k1", ...)
# so the models can be compared and plotted later on.
models_performance = {}
Import pandas, load the training set, and keep only the rows for the digits 2 and 3.
import pandas as pd
# Read the whitespace-separated training file. header=None gives the columns
# integer labels, so the label column can be addressed as column 0 below.
# sep=r"\s+" is the documented replacement for delim_whitespace=True, which
# pandas deprecated in 2.2.
df = pd.read_csv("zip.train", sep=r"\s+", header=None)
# Keep only the rows whose label (column 0) is the digit 2 or 3.
df = df[df[0].isin([2, 3])]
Inspect the data.
# Bare expression with no assignment: as the last line of a notebook cell it
# renders the filtered frame; it has no effect when run as a plain script.
df
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 247 | 248 | 249 | 250 | 251 | 252 | 253 | 254 | 255 | 256 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 4 | 3.0 | -1.000 | -1.000 | -1.000 | -1.000 | -1.000 | -0.928 | -0.204 | 0.751 | 0.466 | ... | 0.466 | 0.639 | 1.000 | 1.000 | 0.791 | 0.439 | -0.199 | -0.883 | -1.0 | -1.0 |
| 6 | 3.0 | -1.000 | -1.000 | -1.000 | -0.830 | 0.442 | 1.000 | 1.000 | 0.479 | -0.328 | ... | 1.000 | 0.671 | 0.345 | -0.507 | -1.000 | -1.000 | -1.000 | -1.000 | -1.0 | -1.0 |
| 26 | 3.0 | -1.000 | -1.000 | -1.000 | -1.000 | -1.000 | -0.104 | 0.549 | 0.579 | 0.579 | ... | 0.388 | 0.579 | 0.811 | 1.000 | 1.000 | 0.715 | 0.107 | -0.526 | -1.0 | -1.0 |
| 30 | 3.0 | -1.000 | -1.000 | -1.000 | -1.000 | -1.000 | -1.000 | -0.107 | 1.000 | 1.000 | ... | -0.280 | 0.322 | 0.813 | 1.000 | 1.000 | 0.633 | -0.144 | -0.994 | -1.0 | -1.0 |
| 35 | 3.0 | -1.000 | -1.000 | -1.000 | -1.000 | -0.674 | 0.492 | 0.573 | 0.755 | -0.018 | ... | 0.537 | 1.000 | 1.000 | 0.689 | -0.530 | -1.000 | -1.000 | -1.000 | -1.0 | -1.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 7282 | 3.0 | -1.000 | -0.882 | -0.334 | 0.267 | 0.333 | 0.749 | 1.000 | 1.000 | 1.000 | ... | 0.968 | 1.000 | 1.000 | 1.000 | 0.809 | 0.325 | -0.820 | -1.000 | -1.0 | -1.0 |
| 7283 | 3.0 | -0.985 | -0.048 | 0.226 | 0.226 | 0.226 | -0.355 | -0.807 | -1.000 | -0.726 | ... | -0.307 | -0.555 | -0.555 | -0.555 | -0.556 | -1.000 | -1.000 | -1.000 | -1.0 | -1.0 |
| 7286 | 3.0 | -1.000 | -1.000 | -1.000 | -0.988 | -0.527 | -0.208 | 0.620 | 1.000 | 0.467 | ... | -0.116 | 0.899 | 0.416 | -0.510 | -1.000 | -1.000 | -1.000 | -1.000 | -1.0 | -1.0 |
| 7287 | 3.0 | -1.000 | -1.000 | -1.000 | -0.990 | 0.708 | 0.557 | 0.347 | -0.107 | -0.758 | ... | 0.697 | 0.636 | 0.167 | -0.968 | -1.000 | -1.000 | -1.000 | -1.000 | -1.0 | -1.0 |
| 7288 | 3.0 | -1.000 | -1.000 | -1.000 | -0.783 | -0.984 | -0.827 | 0.068 | 1.000 | 1.000 | ... | 0.805 | 1.000 | 1.000 | 0.727 | -0.342 | -0.933 | -1.000 | -1.000 | -1.0 | -1.0 |
1389 rows × 257 columns
The first column is the label (Y); the remaining columns are the variables (X).
# Column 0 holds the digit label; the list selector keeps it as a one-column
# DataFrame rather than a Series.
Y = df[[0]]
# Everything except the label column (column labels 1 onward) is a feature.
X = df.drop(columns=[0])
Load the test set, keep only the 2's and 3's, and separate X and Y.
# Read the whitespace-separated test file. sep=r"\s+" is the documented
# replacement for delim_whitespace=True, which pandas deprecated in 2.2.
test_df = pd.read_csv("zip.test", sep=r"\s+", header=None)
# Keep only the rows whose label (column 0) is the digit 2 or 3.
test_df = test_df[test_df[0].isin([2, 3])]
# Column 0 is the label, kept as a one-column DataFrame; the remaining
# columns are the features.
test_Y = test_df.loc[:, 0:0]
test_X = test_df.loc[:, 1:]
Import the model and fit it to the training data.
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score
# Ordinary least-squares linear regression; the digit label in Y is used
# directly as the numeric regression target.
regr = linear_model.LinearRegression()
# Train the model using the training sets
# (left as a bare expression so a notebook cell shows the fitted estimator).
regr.fit(X, Y)
LinearRegression()
Use the linear regression model to predict on the test set and calculate the accuracy.
# Make predictions using the testing set
predict_Y = regr.predict(test_X)
# The regression output is continuous, so it has to be mapped back to a class
# label. Round to the nearest integer, then clip into [2, 3]: without the clip
# a prediction that rounds to 1, 4, ... names a class that does not exist and
# is always scored as an error, even when it lies on the correct side of the
# 2-vs-3 boundary.
predicted_labels = predict_Y.round().clip(2, 3)
acc_lr = accuracy_score(test_Y, predicted_labels)
models_performance["LR"] = acc_lr
print(f'Accuracy score linear regression: {acc_lr:.2f}')
Accuracy score linear regression: 0.96
Import the k-nearest-neighbors model, loop through all values of k, and calculate the accuracy for each.
from sklearn.neighbors import KNeighborsClassifier
# Neighbourhood sizes to compare.
k = [1, 3, 5, 7, 15]
for i in k:
    # Fit a fresh classifier for each neighbourhood size.
    neigh = KNeighborsClassifier(n_neighbors=i)
    # Y is a one-column DataFrame; flatten it to the 1-D label array the
    # classifier is given.
    neigh.fit(X, Y.values.ravel())
    # predict() already returns class labels, so no rounding is needed
    # before scoring.
    kn_predict = neigh.predict(test_X)
    acc_kn = accuracy_score(test_Y, kn_predict)
    # Stored under "k<size>" next to the linear-regression entry.
    models_performance["k" + str(i)] = acc_kn
    print(f'Accuracy score [k={i}]: {acc_kn:.2f}')
Accuracy score [k=1]: 0.98 Accuracy score [k=3]: 0.97 Accuracy score [k=5]: 0.97 Accuracy score [k=7]: 0.97 Accuracy score [k=15]: 0.96
# Bare expression with no assignment: as the last line of a notebook cell it
# shows the collected label -> accuracy mapping.
models_performance
{'LR': 0.9587912087912088,
'k1': 0.9752747252747253,
'k3': 0.9697802197802198,
'k5': 0.9697802197802198,
'k7': 0.967032967032967,
'k15': 0.9615384615384616}
import matplotlib.pyplot as plt
# Vertical bar chart of every model's accuracy.
# The stored accuracies are fractions in [0, 1]; convert them to percent so
# the bars agree with the 'Accuracy %' axis label. A separate list is used so
# models_performance itself is left unchanged (re-running the cell must not
# scale the stored values again).
accuracy_pct = [score * 100 for score in models_performance.values()]
positions = range(len(models_performance))
plt.bar(positions, accuracy_pct, align='center')
# Label each bar with its model key, in the dict's insertion order.
plt.xticks(positions, list(models_performance.keys()))
plt.title('Models Accuracy Comparision')
plt.xlabel('Models')
plt.ylabel('Accuracy %')
plt.show()
# Horizontal bar chart of every model's accuracy, with the value written
# next to each bar.
fig, ax = plt.subplots()
# Passed as barh's third positional argument, i.e. the thickness of each bar.
width = 0.75
# The stored accuracies are fractions in [0, 1]; the axis label, the '%'
# suffix and the "+ 1" text offset below all assume percent, so convert
# once here without modifying models_performance.
accuracy_pct = [score * 100 for score in models_performance.values()]
ax.barh(list(models_performance.keys()), accuracy_pct, width)
plt.title('Models Accuracy Comparision')
plt.xlabel('Accuracy %')
plt.ylabel('Models')
# Write each value one percentage point past the end of its bar.
for i, v in enumerate(accuracy_pct):
    ax.text(v + 1, i, str(round(v, 1)) + '%',
            color='red')
plt.show()