k近邻法(k-NN)|第11天

Steps

Datasets

Code

第1步：数据预处理

# Step 1: load the data, split it, and standardize the features.
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load the dataset. Columns 2 and 3 (by position) are the features and
# column 4 is the target.
# NOTE(review): presumably age / estimated salary / purchased flag, judging
# by the axis labels used in the plots -- confirm against the CSV header.
dataset = pd.read_csv('Social_Network_Ads.csv')
X = dataset.iloc[:, [2, 3]].values
Y = dataset.iloc[:, 4].values

# Hold out 25% of the rows as the test set; random_state pins the shuffle.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.25,
    random_state=0,
)

# Feature scaling: the scaler is fitted on the training rows only and the
# same fitted transformation is then applied to both splits.
sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

第2步：使用K-NN对训练集数据进行训练

# Step 2: fit a K-NN classifier on the training set.
from sklearn.neighbors import KNeighborsClassifier

# Reference signature:
# KNeighborsClassifier(n_neighbors=5, weights='uniform', algorithm='auto', leaf_size=30, p=2, metric='minkowski', metric_params=None, n_jobs=None, **kwargs)
# n_neighbors:   the number K of nearest points to consult
# weights:       weighting applied to the K neighbouring samples
# algorithm:     the neighbour-search algorithm to use
# leaf_size:     leaf-node threshold at which building sub-trees stops
# p:             auxiliary parameter of the distance metric; p=1 is Manhattan
#                distance, p=2 is Euclidean distance (default 2)
# metric:        the distance metric, Minkowski by default; combined with p
# metric_params: other auxiliary parameters, mainly the weights of a weighted
#                Minkowski distance and arguments of more complex metrics
# n_jobs:        number of parallel jobs, mainly to speed up building the
#                K-NN tree and the prediction search on multi-core CPUs
#
# fit() returns the fitted estimator itself, so construction and training
# can be chained into a single assignment.
classifier = KNeighborsClassifier(
    n_neighbors=5,
    metric='minkowski',
    p=2,
).fit(X_train, Y_train)

第3步：预测

# Step 3: predict the class label of every sample in the test set.
Y_pred = classifier.predict(X_test)

第4步：评估预测

1
2
3

# Build the confusion matrix from the true test labels and the predictions.
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_true=Y_test, y_pred=Y_pred)

训练集可视化

# Plot the classifier's decision regions together with the training samples.
from matplotlib.colors import ListedColormap

X_set, Y_set = X_train, Y_train

# Dense grid (step 0.01) spanning both features, padded by 1 on every side.
# The features were standardized before training, so the axes are in scaled
# units rather than raw values.
X1, X2 = np.meshgrid(
    np.arange(start=X_set[:, 0].min() - 1, stop=X_set[:, 0].max() + 1, step=0.01),
    np.arange(start=X_set[:, 1].min() - 1, stop=X_set[:, 1].max() + 1, step=0.01),
)

# Classify every grid point and paint the plane with the predicted class.
plt.contourf(
    X1,
    X2,
    classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),
    alpha=0.75,
    cmap=ListedColormap(('red', 'green')),
)
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())

# One scatter series per class. The colour is passed through `color=` rather
# than `c=`: a single RGBA tuple given as `c` is ambiguous (it can be read as
# four per-point values) and triggers a matplotlib warning.
point_cmap = ListedColormap(('black', 'white'))
for i, j in enumerate(np.unique(Y_set)):
    plt.scatter(
        X_set[Y_set == j, 0],
        X_set[Y_set == j, 1],
        color=point_cmap(i),
        label=j,
    )
plt.title('K Nearest Neighbours(Training set)')
plt.xlabel('Age')
plt.ylabel('Estimated Salary')
plt.legend()
plt.show()

测试集可视化

# Plot the classifier's decision regions together with the test samples.
# Imported here as well so this snippet can run on its own.
from matplotlib.colors import ListedColormap

# Points are coloured by their TRUE labels (Y_test). Colouring them by the
# predictions would make every point agree with the region it lies in and
# hide the misclassified samples.
X_set, Y_set = X_test, Y_test

# Dense grid (step 0.01) spanning both features, padded by 1 on every side.
# The features were standardized before training, so the axes are in scaled
# units rather than raw values.
X1, X2 = np.meshgrid(
    np.arange(start=X_set[:, 0].min() - 1, stop=X_set[:, 0].max() + 1, step=0.01),
    np.arange(start=X_set[:, 1].min() - 1, stop=X_set[:, 1].max() + 1, step=0.01),
)

# Classify every grid point and paint the plane with the predicted class.
plt.contourf(
    X1,
    X2,
    classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),
    alpha=0.75,
    cmap=ListedColormap(('red', 'green')),
)
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())

# One scatter series per class. The colour is passed through `color=` rather
# than `c=`: a single RGBA tuple given as `c` is ambiguous (it can be read as
# four per-point values) and triggers a matplotlib warning.
point_cmap = ListedColormap(('black', 'white'))
for i, j in enumerate(np.unique(Y_set)):
    plt.scatter(
        X_set[Y_set == j, 0],
        X_set[Y_set == j, 1],
        color=point_cmap(i),
        label=j,
    )
plt.title('K Nearest Neighbours(Test set)')
plt.xlabel('Age')
plt.ylabel('Estimated Salary')
plt.legend()
plt.show()