6月
27

慕课网机器学习笔记（3）

技术

内容提要

一个小注意点
K临近算法
- 算法基本思想
- 使用sklearn中的鸢尾花进行测试
- 对方法进行封装
- 使用sklearn中的K临近算法库进行预测
- 对自己写的算法进行sklearn风格的封装
- 对提供的训练数据进行拆分
  - 使用自己的写的方法对数据进行拆分
  - 使用sklearn提供的方法对其进行拆分
- 使用K临近算法对手写体进行预测
- 超参数概念

一个小注意点

jupyter notebook 在导入模块之后，如果模块的代码被修改了，需要重新导入才能生效：可以使用 importlib 模块的 reload 方法；当然，也可以选择重启 Python 内核，不过重启后重新执行时，如果之前输入的代码有出错，会打断其执行

K临近算法

算法基本思想

选取离待预测数据最近的 k 个训练样本，统计其中哪个类别的样本数量最多，则待预测数据很可能属于该类别

对于kNN算法，训练集就是模型

使用sklearn中的鸢尾花进行测试

from sklearn import datasets
import matplotlib.pyplot as plt
from math import sqrt
from collections import Counter
import numpy as np

iris = datasets.load_iris()

# Use only the last two feature columns so the samples can be drawn in 2-D.
X = iris.data[:, 2:]
y = iris.target

# The point to classify and the number of neighbours that get a vote.
new_point = np.array([2.1, 0.6])
k = 3

# Draw each class with its own colour/marker and the query point in blue.
plt.scatter(X[y == 0, 0], X[y == 0, 1], color="red", marker="o")
plt.scatter(X[y == 1, 0], X[y == 1, 1], color="green", marker="+")
plt.scatter(X[y == 2, 0], X[y == 2, 1], color="black", marker="*")
plt.scatter(new_point[0], new_point[1], color="blue")
plt.show()

# Euclidean distance from every training sample to the query point.
distant = [sqrt(np.sum((train_data - new_point) ** 2)) for train_data in X]

# Indices of the training samples ordered from nearest to farthest.
nearest = np.argsort(distant)

# Labels of the k nearest samples only. Without the [:k] slice every sample
# would vote and the result would not depend on the distances at all.
result = [y[i] for i in nearest[:k]]

# The most frequent label among those k neighbours is the prediction.
result_type = Counter(result).most_common(1)[0][0]

对方法进行封装

import numpy as np
from math import sqrt
from collections import Counter


def KNN_classify(k: int, x_train: np.ndarray, y_train: np.ndarray, x: np.ndarray) -> int:
    """Classify one sample by majority vote among its k nearest training samples.

    Args:
        k: number of neighbours that vote; must satisfy 1 <= k <= number of
            training samples.
        x_train: 2-D array of training samples, one sample per row.
        y_train: labels of the training samples, same length as x_train.
        x: 1-D feature vector of the sample to classify.

    Returns:
        The label that occurs most often among the k training samples with
        the smallest Euclidean distance to x.

    Raises:
        AssertionError: if k is out of range or the array sizes do not match.
    """
    assert 1 <= k <= x_train.shape[0], "k must be valid"
    assert x_train.shape[0] == y_train.shape[0], \
        "the size of x_train must equal to the size of y_train"
    assert x_train.shape[1] == x.shape[0], \
        "the feature number of x must be equal to x_train"

    distant = [sqrt(np.sum((train_data - x) ** 2)) for train_data in x_train]
    nearest = np.argsort(distant)
    # Only the k closest samples may vote; the previous version tallied every
    # training label, so k had no effect on the result.
    result = [y_train[i] for i in nearest[:k]]
    return Counter(result).most_common(1)[0][0]

使用sklearn中的K临近算法库进行预测

from sklearn.neighbors import KNeighborsClassifier
# Create a 5-neighbour classifier and fit it on the training data in one
# chained expression (fit returns the estimator itself).
KNN_classifier = KNeighborsClassifier(n_neighbors=5).fit(X, y)
# predict takes a 2-D array, so present the single sample as one row.
KNN_classifier.predict(new_point[np.newaxis, :])
# array([0])

对自己写的算法进行sklearn风格的封装

class KNNClassifier:
    """k-nearest-neighbours classifier with a fit/predict style interface."""

    def __init__(self, k: int):
        """Remember the neighbour count; training data is attached by fit."""
        assert k >= 1, "k must be valid"
        self.k: int = k
        # Both stay None until fit has been called.
        self._x_train: np.ndarray = None
        self._y_train: np.ndarray = None

    def fit(self, x_train: np.ndarray, y_train: np.ndarray) -> 'KNNClassifier':
        """Store the training samples and labels, then return self."""
        sample_count = x_train.shape[0]
        assert 1 <= self.k <= sample_count, "k must be valid"
        assert sample_count == y_train.shape[0], \
            "the size of x_train must equal to the size of y_train"

        self._x_train, self._y_train = x_train, y_train
        return self

    def predict(self, x_predict: np.ndarray) -> np.ndarray:
        """Return an array with one predicted label per row of x_predict."""
        is_fitted = self._x_train is not None and self._y_train is not None
        assert is_fitted, \
            "must fit before predict"
        assert x_predict.shape[1] == self._x_train.shape[1], \
            "the feature number of x must be equal to x_train"

        return np.array([self._predict(sample) for sample in x_predict])

    def _predict(self, x: np.ndarray) -> int:
        """Return the predicted label for the single sample x."""
        assert self._x_train.shape[1] == x.shape[0], \
            "the feature number of x must be equal to x_train"

        # Euclidean distance from x to every stored training sample.
        distances = []
        for row in self._x_train:
            delta = row - x
            distances.append(sqrt(np.sum(delta ** 2)))

        # Majority vote among the labels of the k closest samples.
        closest = np.argsort(distances)[:self.k]
        votes = Counter(self._y_train[idx] for idx in closest)
        winner, _count = votes.most_common(1)[0]
        return winner

    def __repr__(self):
        return "KNN(k=%d)" % (self.k,)

对提供的训练数据进行拆分

使用自己写的方法对数据进行拆分

在输入的数据中抽出一部分数据作为训练数据，而另一部分作为测试数据

使用自己写的方法对数据进行拆分

import numpy as np


def train_test_split(x_train: np.ndarray, y_train: np.ndarray, test_ratio=0.2, seed=None):
    """Randomly split samples and labels into a training part and a test part.

    Args:
        x_train: samples, indexed along the first axis.
        y_train: labels; must have as many entries as x_train.
        test_ratio: fraction of the data (0.0 to 1.0) placed in the test part.
            The test size is int(len(x_train) * test_ratio), i.e. rounded down.
        seed: optional value passed to np.random.seed so the shuffle is
            reproducible. None leaves NumPy's global random state untouched.

    Returns:
        The tuple (new_x_train, x_test, new_y_train, y_test).

    Raises:
        AssertionError: if the sizes differ or test_ratio is outside [0, 1].
    """
    assert x_train.shape[0] == y_train.shape[0], \
        "the size of x_train must be equal to the size of y_train"
    assert 0.0 <= test_ratio <= 1.0, \
        "test_ratio must be valid"

    # Compare against None explicitly: a plain truthiness test would silently
    # ignore the perfectly valid seed 0.
    if seed is not None:
        np.random.seed(seed)

    # Shuffle the row indices rather than the data so that samples and
    # labels stay aligned.
    shuffle_indexes = np.random.permutation(len(x_train))

    test_size = int(len(x_train) * test_ratio)
    test_indexes = shuffle_indexes[:test_size]
    train_indexes = shuffle_indexes[test_size:]

    # Pick the rows that belong to each part using the shuffled indices.
    new_x_train = x_train[train_indexes]
    new_y_train = y_train[train_indexes]

    x_test = x_train[test_indexes]
    y_test = y_train[test_indexes]

    return new_x_train, x_test, new_y_train, y_test

使用sklearn提供的方法对其进行拆分

from sklearn.model_selection import train_test_split

# Hold out 20% of the samples as the test set.
new_x_train, x_test, new_y_train, y_test = train_test_split(X, y, test_size=0.2)

使用K临近算法对手写体进行预测

初次查看

import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from sklearn import datasets

# Load the handwritten-digits dataset. load_digits has to be called: the bare
# function object has no keys(), data or target.
digits = datasets.load_digits()

# Entries available: dict_keys(['data', 'target', 'target_names', 'images', 'DESCR'])
digits.keys()

# Look at one sample: its label, and its features redrawn as an 8x8 image.
some_digit = digits.data[666]
digits.target[666]
some_digit_image = some_digit.reshape(8, 8)
plt.imshow(some_digit_image, cmap=matplotlib.cm.binary)
plt.show()

使用自己写的KNN对其进行训练和预判

import pycharm_project.knn as knn
import pycharm_project.model_selection as ms

# Split the digits data, then train the project's own classifier with k=6.
x_train, x_test, y_train, y_test = ms.train_test_split(digits.data, digits.target)
kc = knn.KNNClassifier(6)
kc.fit(x_train, y_train)
predict_result = kc.predict(x_test)

# Accuracy = share of test samples predicted correctly, e.g. 0.9860724233983287
matches = predict_result == y_test
matches.sum() / len(y_test)

对评估结果进行封装

""" metrics.py 对机器学习结果进行评估 """
import numpy as np


def accuracy_score(y_true: np.ndarray, y_predict: np.ndarray) -> float:
    """Return the fraction of predictions that equal the true labels.

    Args:
        y_true: the correct labels (expected to be 1-D).
        y_predict: the predicted labels, same length as y_true.

    Returns:
        Number of matching positions divided by len(y_predict).

    Raises:
        AssertionError: if the two arrays differ in length.
        ZeroDivisionError: if the arrays are empty.
    """
    assert y_true.shape[0] == y_predict.shape[0], \
        "the size of y_true must be equal to y_predict"

    # Count the matches in one vectorised call instead of a Python-level sum.
    matches = int(np.count_nonzero(y_true == y_predict))
    return matches / len(y_predict)

为自己写的KNNClassifier类添加评估准确性的方法

有时并不关心预测结果，只是想查看正确率时，可以直接使用这个方法

#...

def score(self, x_test: np.ndarray, y_test: np.ndarray) -> float:
    """Predict labels for x_test and return their accuracy against y_test."""
    return accuracy_score(y_test, self.predict(x_test))

#...

超参数概念

超参数：在算法运行前需要决定的参数。与之相对的是模型参数：算法过程中学习得到的参数

KNN算法没有模型参数，k为典型的超参数

对k进行调参

先对 1-10 进行测试；但是如果最佳参数恰好落在范围的边界上（例如 10），说明更好的取值可能在范围之外，可能就要对 10-20 再进行一次测试

best_score, best_k = 0.0, -1

# Try k = 1..10 and remember the first k that reaches the highest test score.
for i in range(1, 11):
    kc_tmp = KNeighborsClassifier(n_neighbors=i).fit(train_x, train_y)
    score_tmp = kc_tmp.score(test_x, test_y)
    if not score_tmp > best_score:
        continue
    best_score, best_k = score_tmp, i
print(best_score, best_k)
# 0.9916666666666667 4

寻找最佳评估路径方法

是否考虑点距离的权重

best_score, best_k, best_method = 0.0, -1, ""

# Search both weighting schemes (distance-weighted and uniform votes) for
# every k in 1..10; the first combination with the highest score is kept.
for m in ["distance", "uniform"]:
    for i in range(1, 11):
        kc_tmp = KNeighborsClassifier(n_neighbors=i, weights=m).fit(train_x, train_y)
        score_tmp = kc_tmp.score(test_x, test_y)
        if not score_tmp > best_score:
            continue
        best_score, best_k, best_method = score_tmp, i, m
print(best_score, best_k, best_method)
# 0.9916666666666667 4 uniform

对距离的定义

在 1-5 之间寻找最佳的距离参数 p（即明可夫斯基距离的指数）

%%time
best_score, best_k, best_p = 0.0, -1, -1

# For every k in 1..10 try p = 1..5 with distance-weighted votes; the first
# combination with the highest score is kept.
for i in range(1, 11):
    for p in range(1, 6):
        # p is handed to the classifier as its distance parameter.
        kc_tmp = KNeighborsClassifier(n_neighbors=i, weights="distance", p=p).fit(train_x, train_y)
        score_tmp = kc_tmp.score(test_x, test_y)
        if not score_tmp > best_score:
            continue
        best_score, best_k, best_p = score_tmp, i, p
print(best_score, best_k, best_p)
# 0.9888888888888889 3 2
# CPU times: user 14.5 s, sys: 21.1 ms, total: 14.5 s
# Wall time: 14.6 s

Gitalking ...

Schwarzeni

内容提要

一个小注意点

K临近算法

算法基本思想

使用sklearn中的鸢尾花进行测试

对方法进行封装

使用sklearn中的K临近算法库进行预测

对自己写的算法进行sklearn风格的封装

对提供的训练数据进行拆分

使用自己的写的方法对数据进行拆分

使用自己的写的方法对数据进行拆分

使用sklearn提供的方法对其进行拆分

使用K临近算法对手写体进行预测

初次查看

使用自己写的KNN对其进行训练和预判

对评估结果进行封装

为自己写的KNNClassifier类添加评估准确性的方法

超参数概念

对k进行调参

寻找最佳评估路径方法

对距离的定义