The k-Nearest Neighbors (kNN) algorithm
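
This example uses kNN regression from scikit-learn to estimate the age of abalone, given as the number of shell rings, from physical measurements in the UCI Abalone dataset. It fits a plain kNN model first, then tunes the hyperparameters with GridSearchCV, and finally wraps the tuned model in a bagging ensemble.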



examples/ml/abalone.py
import requests
import os
import shutil
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
from math import sqrt
import seaborn as sns
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import BaggingRegressor


def get_files():
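    """Download the Abalone data and description files from the UCI repository,
    caching them locally so repeated runs do not download them again."""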
    data_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data"
    names_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.names"

    filenames = []
    for url in (data_url, names_url):
        filename = url.split('/')[-1]
        filenames.append(filename)
        if not os.path.exists(filename):
            with requests.get(url, stream=True) as response:
                response.raise_for_status()
                # make sure urllib3 decodes any gzip transfer-encoding before writing
                response.raw.decode_content = True
                with open(filename, 'wb') as fh:
                    shutil.copyfileobj(response.raw, fh)
    return filenames



if __name__ == "__main__":
    data_file, names_file = get_files()
    columns = ["Sex", "Length", "Diameter", "Height", "Whole weight", "Shucked weight", "Viscera weight", "Shell weight", "Rings"]
    df = pd.read_csv(data_file, names=columns)
    #print(df.head())
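    # Drop the categorical Sex column; kNN computes distances, which need numeric features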
    df = df.drop("Sex", axis=1)
    #print(df.head())

    #df["Rings"].hist(bins=15)
    #plt.show()

    #correlation_matrix = df.corr()
    #print(correlation_matrix["Rings"])

    X = df.drop("Rings", axis=1)
    X = X.values
    y = df["Rings"]
    y = y.values

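    # Hold out 25% of the rows (the default) as a test set; random_state=0 makes the split reproducible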
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    knn_model = KNeighborsRegressor(n_neighbors=3).fit(X_train, y_train)


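    # Evaluate with RMSE: the square root of the mean squared error, in the same unit as the target (rings)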
    train_predictions = knn_model.predict(X_train)
    train_mse = mean_squared_error(y_train, train_predictions)
    train_rmse = sqrt(train_mse)
    print(train_rmse) # 1.67

    test_predictions = knn_model.predict(X_test)
    test_mse = mean_squared_error(y_test, test_predictions)
    test_rmse = sqrt(test_mse)
    print(test_rmse) # 2.36
    # The RMSE is measured in rings (roughly years), so the test predictions are off by about 2.4 rings on average.
    # The large gap between the training and the test error suggests overfitting.

    # cmap = sns.cubehelix_palette(as_cmap=True)
    # f, ax = plt.subplots()
    # # Length and Diameter, the two columns with strong correlation
    # points = ax.scatter(X_test[:, 0], X_test[:, 1], c=test_predictions, s=50, cmap=cmap)
    # f.colorbar(points)
    # plt.show()

    # cmap = sns.cubehelix_palette(as_cmap=True)
    # f, ax = plt.subplots()
    # # Length and Diameter, the two columns with strong correlation
    # points = ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, s=50, cmap=cmap)
    # f.colorbar(points)
    # plt.show()


    # Tuning hyperparameters
    # What should the value of k be? With k = 1 the prediction depends entirely on a single,
    # possibly outlier, neighbor. With k equal to the number of training samples every
    # prediction returns the same answer: the average of the whole training set.

    # Look for the best value of k in the range 1-49
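    # GridSearchCV tries each candidate value with cross-validation on the training set
    # and keeps the one with the best average score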
    # parameters = {"n_neighbors": range(1, 50)}
    # gridsearch = GridSearchCV(KNeighborsRegressor(), parameters)
    # gscv = gridsearch.fit(X_train, y_train)
    # # print(gscv)
    # print(gridsearch.best_params_)  # {'n_neighbors': 17}

    # train_preds_grid = gridsearch.predict(X_train)
    # train_mse = mean_squared_error(y_train, train_preds_grid)
    # train_rmse = sqrt(train_mse)
    # test_preds_grid = gridsearch.predict(X_test)
    # test_mse = mean_squared_error(y_test, test_preds_grid)
    # test_rmse = sqrt(test_mse)
    # print(train_rmse)
    # print(test_rmse)


    # Weighted Average of Neighbors Based on Distance
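    # With weights="distance" closer neighbors count more: each neighbor's contribution
    # is weighted by the inverse of its distance instead of all neighbors counting equally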

    parameters = {
        "n_neighbors": range(1, 50),
        "weights": ["uniform", "distance"],
    }
    gridsearch = GridSearchCV(KNeighborsRegressor(), parameters)
    gridsearch.fit(X_train, y_train)
    # print(gridsearch.best_params_)  # now reports both 'n_neighbors' and 'weights'
    # train_preds_grid = gridsearch.predict(X_train)
    # train_mse = mean_squared_error(y_train, train_preds_grid)
    # train_rmse = sqrt(train_mse)
    # test_preds_grid = gridsearch.predict(X_test)
    # test_mse = mean_squared_error(y_test, test_preds_grid)
    # test_rmse = sqrt(test_mse)
    # print(train_rmse)
    # print(test_rmse)

    best_k = gridsearch.best_params_["n_neighbors"]
    best_weights = gridsearch.best_params_["weights"]
    bagged_knn = KNeighborsRegressor(
        n_neighbors=best_k, weights=best_weights
    )

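    # Bagging fits 100 copies of the tuned kNN model, each on a bootstrap sample of the
    # training data, and averages their predictions to reduce variance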
    bagging_model = BaggingRegressor(bagged_knn, n_estimators=100)
    bagging_model.fit(X_train, y_train)
    train_preds_grid = bagging_model.predict(X_train)
    train_mse = mean_squared_error(y_train, train_preds_grid)
    train_rmse = sqrt(train_mse)
    test_preds_grid = bagging_model.predict(X_test)
    test_mse = mean_squared_error(y_test, test_preds_grid)
    test_rmse = sqrt(test_mse)
    print(train_rmse)
    print(test_rmse)
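
    # As a quick usage sketch: predict the ring count of a single new abalone.
    # The measurement values below are made up for illustration, not taken from the dataset.
    # Order: Length, Diameter, Height, Whole weight, Shucked weight, Viscera weight, Shell weight
    sample = [[0.5, 0.4, 0.15, 0.8, 0.35, 0.18, 0.24]]
    print(bagging_model.predict(sample))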