慕课网机器学习笔记(3)

内容提要


一个小注意点

在 jupyter notebook 中导入模块后,如果模块的源码被修改,需要重新导入才能生效:可以使用 importlib 模块的 reload 方法;也可以选择重启 python 内核,不过重启后之前的单元格需要重新执行,如果其中有出错的代码,执行会被打断


K近邻算法

算法基本思想

选取离测试数据最近的k个样本,统计其中哪种类别的样本数量最多,则测试数据很可能属于该类别

对于kNN算法,训练集就是模型


使用sklearn中的鸢尾花进行测试

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
from sklearn import datasets
import matplotlib.pyplot as plt
from math import sqrt
from collections import Counter
import numpy as np

iris = datasets.load_iris()

# Keep only the columns from index 2 onward (the last two features) so the
# samples can be drawn on a 2-D scatter plot.
X = iris.data[:, 2:]
y = iris.target

# The point to classify and the number of neighbours that get a vote.
new_point = np.array([2.1, 0.6])
k = 3

# Draw each class with its own colour/marker and the query point in blue.
plt.scatter(X[y == 0, 0], X[y == 0, 1], color="red", marker="o")
plt.scatter(X[y == 1, 0], X[y == 1, 1], color="green", marker="+")
plt.scatter(X[y == 2, 0], X[y == 2, 1], color="black", marker="*")
plt.scatter(new_point[0], new_point[1], color="blue")
plt.show()

# Euclidean distance from every training sample to the query point.
distant = [sqrt(np.sum((train_data - new_point) ** 2)) for train_data in X]

# Indices of the training samples, ordered from nearest to farthest.
nearest = np.argsort(distant)

# Labels of the k nearest samples only. Voting over *all* samples would
# ignore k and simply return the most common label of the whole data set.
result = [iris.target[i] for i in nearest[:k]]

# Majority vote among the k nearest labels.
result_type = Counter(result).most_common(1)[0][0]

对方法进行封装

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
import numpy as np
from math import sqrt
from collections import Counter


def KNN_classify(k: int, x_train: np.ndarray, y_train: np.ndarray, x: np.ndarray) -> int:
    """Classify one sample by majority vote among its k nearest neighbours.

    Distances are Euclidean. Only the ``k`` training samples closest to ``x``
    take part in the vote.

    Args:
        k: number of neighbours that vote; must satisfy
            ``1 <= k <= x_train.shape[0]``.
        x_train: 2-D array of training samples, one row per sample.
        y_train: labels for ``x_train``; same length along axis 0.
        x: 1-D array holding the sample to classify; its length must equal
            the number of columns of ``x_train``.

    Returns:
        The label that occurs most often among the k nearest neighbours.

    Raises:
        AssertionError: if ``k`` or the array shapes are inconsistent.
    """
    assert 1 <= k <= x_train.shape[0], "k must be valid"
    assert x_train.shape[0] == y_train.shape[0], \
        "the size of x_train must equal to the size of y_train"
    assert x_train.shape[1] == x.shape[0], \
        "the feature number of x must be equal to x_train"

    # One vectorised pass: row-wise Euclidean distance to x.
    distances = np.sqrt(np.sum((x_train - x) ** 2, axis=1))
    nearest = np.argsort(distances)
    # Restrict the vote to the k closest samples; without the slice every
    # training sample would vote and k would have no effect.
    top_k_labels = [y_train[i] for i in nearest[:k]]
    return Counter(top_k_labels).most_common(1)[0][0]

使用sklearn中的K近邻算法库进行预测

1
2
3
4
5
6
7
8
from sklearn.neighbors import KNeighborsClassifier
# Create a classifier instance configured with n_neighbors=5.
KNN_classifier = KNeighborsClassifier(n_neighbors=5)
# Build the model by fitting it on the training data.
KNN_classifier.fit(X, y)
# Predict; the single point is reshaped into a one-row 2-D array first.
KNN_classifier.predict(new_point.reshape(1, -1))
# array([0])

对自己写的算法进行sklearn风格的封装

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
class KNNClassifier:
    """k-nearest-neighbours classifier with a fit/predict interface."""

    def __init__(self, k: int):
        """Store the number of voting neighbours; no training data yet."""
        assert k >= 1, "k must be valid"
        self.k: int = k
        self._x_train: np.ndarray = None
        self._y_train: np.ndarray = None

    def fit(self, x_train: np.ndarray, y_train: np.ndarray) -> 'KNNClassifier':
        """Remember the training set and return ``self``."""
        sample_count = x_train.shape[0]
        assert 1 <= self.k <= sample_count, "k must be valid"
        assert sample_count == y_train.shape[0], \
            "the size of x_train must equal to the size of y_train"

        self._x_train, self._y_train = x_train, y_train
        return self

    def predict(self, x_predict: np.ndarray) -> np.ndarray:
        """Predict a label for every row of ``x_predict``."""
        assert not (self._x_train is None or self._y_train is None), \
            "must fit before predict"
        assert x_predict.shape[1] == self._x_train.shape[1], \
            "the feature number of x must be equal to x_train"

        return np.array([self._predict(row) for row in x_predict])

    def _predict(self, x: np.ndarray) -> int:
        """Predict the label of the single sample ``x``."""
        assert self._x_train.shape[1] == x.shape[0], \
            "the feature number of x must be equal to x_train"

        # Euclidean distance from x to each stored training sample.
        distances = [sqrt(np.sum((row - x) ** 2)) for row in self._x_train]
        order = np.argsort(distances)
        # Only the k closest samples vote.
        votes = Counter(self._y_train[i] for i in order[:self.k])
        winner, _count = votes.most_common(1)[0]
        return winner

    def __repr__(self):
        return "KNN(k=%d)" % self.k

对提供的训练数据进行拆分

使用自己写的方法对数据进行拆分

在输入的数据中抽出一部分数据作为训练的数据,而另一部分作为测试数据

使用自己写的方法对数据进行拆分

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
import numpy as np


def train_test_split(x_train: np.ndarray, y_train: np.ndarray, test_ratio=0.2, seed=None):
    """Randomly split samples and labels into a training part and a test part.

    The row indices are shuffled with ``np.random.permutation``; the first
    ``int(len(x_train) * test_ratio)`` shuffled rows form the test set and
    the remaining rows form the new training set.

    Args:
        x_train: samples, indexed along axis 0.
        y_train: labels; must have the same length as ``x_train``.
        test_ratio: fraction of the rows to put in the test set, in
            ``[0.0, 1.0]``.
        seed: when not ``None``, passed to ``np.random.seed`` before
            shuffling so the split is reproducible. Note that this reseeds
            NumPy's global random state.

    Returns:
        A tuple ``(new_x_train, x_test, new_y_train, y_test)``.

    Raises:
        AssertionError: if the lengths differ or ``test_ratio`` is out of
            range.
    """
    assert x_train.shape[0] == y_train.shape[0], \
        "the size of x_train must be equal to the size of y_train"
    assert 0.0 <= test_ratio <= 1.0, \
        "test_ratio must be valid"

    # Compare against None so that seed=0 is honoured as well.
    if seed is not None:
        np.random.seed(seed)

    # Shuffle the row indices rather than the data itself, so samples and
    # labels stay aligned.
    shuffle_indexes = np.random.permutation(len(x_train))

    test_size = int(len(x_train) * test_ratio)
    test_indexes = shuffle_indexes[:test_size]
    train_indexes = shuffle_indexes[test_size:]

    # Pick the rows that belong to each part.
    new_x_train = x_train[train_indexes]
    new_y_train = y_train[train_indexes]

    x_test = x_train[test_indexes]
    y_test = y_train[test_indexes]

    return new_x_train, x_test, new_y_train, y_test

使用sklearn提供的方法对其进行拆分

1
2
# Use scikit-learn's splitter; test_size=0.2 is the value passed for the
# test portion.
# NOTE: this import rebinds the name train_test_split if a hand-written
# function of the same name is already defined in this namespace.
from sklearn.model_selection import  train_test_split
new_x_train, x_test, new_y_train, y_test = train_test_split(X, y, test_size=0.2)

使用K近邻算法对手写体进行预测

初次查看

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from sklearn import datasets

# Load the handwritten-digits data set. load_digits has to be *called*;
# without the parentheses `digits` would be bound to the loader function
# itself and the attribute accesses below would fail.
digits = datasets.load_digits()

# Inspect the available fields:
# dict_keys(['data', 'target', 'target_names', 'images', 'DESCR'])
digits.keys()

# Look at one sample: its feature row, its label, and the row reshaped to
# 8x8 and rendered as an image.
some_digit = digits.data[666]
digits.target[666]
some_digit_image = some_digit.reshape(8, 8)
plt.imshow(some_digit_image, cmap=matplotlib.cm.binary)
plt.show()


使用自己写的KNN对其进行训练和预测

1
2
3
4
5
6
7
8
9
10
import pycharm_project.knn as knn
import pycharm_project.model_selection as ms

# Split the digits data with the project's own train_test_split (default arguments).
x_train, x_test, y_train, y_test = ms.train_test_split(digits.data, digits.target)
# Train the project's own classifier with k=6 and predict the test samples.
kc = knn.KNNClassifier(6)
kc.fit(x_train, y_train)
predict_result = kc.predict(x_test)

# Prediction accuracy (fraction of matching labels): 0.9860724233983287
sum(predict_result == y_test) / len(y_test)

对评估结果进行封装

1
2
3
4
5
6
7
8
9
""" metrics.py 对机器学习结果进行评估 """
import numpy as np


def accuracy_score(y_true: np.ndarray, y_predict: np.ndarray) -> float:
    """Return the fraction of positions where ``y_predict`` equals ``y_true``.

    Args:
        y_true: the reference labels.
        y_predict: the predicted labels; same length as ``y_true``.

    Returns:
        Number of matching positions divided by ``len(y_predict)``.

    Raises:
        AssertionError: if the two arrays differ in length along axis 0.
        ZeroDivisionError: if the arrays are empty.
    """
    assert y_true.shape[0] == y_predict.shape[0], \
        "the size of y_true must be equal to the size of y_predict"

    # Count the matches inside NumPy instead of iterating the boolean array
    # element by element with the builtin sum().
    matches = int(np.count_nonzero(y_true == y_predict))
    return matches / len(y_predict)

为自己写的KNNClassifier类添加评估准确性的方法

有时并不关心预测结果,只是想查看正确率时,可以直接使用这个方法

1
2
3
4
5
6
7
#...

def score(self, x_test: np.ndarray, y_test: np.ndarray) -> float:
    """Predict labels for ``x_test`` and return their accuracy against ``y_test``."""
    return accuracy_score(y_test, self.predict(x_test))

#...


超参数概念

超参数:在算法运行前需要决定的参数。与之相对的是模型参数:算法过程中学习的参数

KNN算法没有模型参数,k为典型的超参数

对k进行调参

先对1-10进行测试;但是如果最佳参数恰好落在边界上(例如10),真正的最佳值可能在搜索范围之外,就要对10-20再进行一次测试

1
2
3
4
5
6
7
8
9
10
11
12
# Best accuracy seen so far and the k that produced it.
best_score, best_k = 0.0, -1

for k in range(1, 11):
    candidate = KNeighborsClassifier(n_neighbors=k)
    candidate.fit(train_x, train_y)
    accuracy = candidate.score(test_x, test_y)
    # Strict ">" keeps the first k that reaches the top score.
    if accuracy > best_score:
        best_score, best_k = accuracy, k
print(best_score, best_k)
# 0.9916666666666667 4

寻找最佳评估路径方法

是否考虑点距离的权重

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
# Best accuracy seen so far, with the k and weighting that produced it.
best_score, best_k, best_method = 0.0, -1, ""

# Try the classifier with and without distance weighting.
for weighting in ("distance", "uniform"):
    for k in range(1, 11):
        candidate = KNeighborsClassifier(n_neighbors=k, weights=weighting)
        candidate.fit(train_x, train_y)
        accuracy = candidate.score(test_x, test_y)
        # Strict ">" keeps the first combination that reaches the top score.
        if accuracy > best_score:
            best_score, best_k, best_method = accuracy, k, weighting
print(best_score, best_k, best_method)
# 0.9916666666666667 4 uniform

对距离的定义

在1-5之间寻找最佳的距离参数p

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
%%time
# Best accuracy seen so far, with the k and p that produced it.
best_score, best_k, best_p = 0.0, -1, -1
for k in range(1, 11):
    for power in range(1, 6):
        # Hand the candidate p to the classifier; weights stays "distance".
        candidate = KNeighborsClassifier(n_neighbors=k, weights="distance", p=power)
        candidate.fit(train_x, train_y)
        accuracy = candidate.score(test_x, test_y)
        # Strict ">" keeps the first combination that reaches the top score.
        if accuracy > best_score:
            best_score, best_k, best_p = accuracy, k, power
print(best_score, best_k, best_p)
# 0.9888888888888889 3 2
# CPU times: user 14.5 s, sys: 21.1 ms, total: 14.5 s
# Wall time: 14.6 s