参考

安装

  • 参考莫烦的安装教程,或者直接安装Anaconda的环境,网上搜索

选择学习方法

在这里插入图片描述

  • 数据小于50,需要寻找更多的数据
  • 监督学习
    • 分类学习
    • 线性回归学习,预测
  • 非监督学习
    • clustering 分堆
    • dimensionality reduction
      • 属性较多,将多属性压缩为更好的简单信息

通用学习模式

#!/usr/bin/env python3
# -*- coding:utf-8 -*-

import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Load the bundled iris dataset: feature matrix and class labels.
iris = datasets.load_iris()
iris_X = iris.data    # feature matrix
iris_y = iris.target  # class labels

# Uncomment to inspect the raw data.
# print(iris_y)
# print(iris_X[:3, :])

# Split into a training set and a test set so evaluation does not touch
# the data the model learned from; test_size=0.3 reserves 30% of the
# samples for testing. The split also shuffles the samples.
X_train, X_test, y_train, y_test = train_test_split(iris_X, iris_y, test_size=0.3)
# train_test_split arguments:
#   *arrays      - the feature/label arrays to split together
#   test_size    - fraction for the test set (an int means a sample count)
#   random_state - RNG seed for the shuffle (for reproducibility)

# Compare the labels before and after splitting to see the shuffling.
# print(iris_y)
# print(y_train)

# Fit a k-nearest-neighbours classifier on the training data.
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)

# Print predictions next to the true labels for a quick visual check.
print(knn.predict(X_test))
print(y_test)

sklearn数据集与自建数据

#!/usr/bin/env python3
# -*- coding:utf-8 -*-

import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt

# # Load a bundled regression dataset
# loaded_data = datasets.load_boston()
# data_X = loaded_data.data
# data_Y = loaded_data.target
#
# # Fit an ordinary least-squares linear regression model
# model = LinearRegression()
# model.fit(data_X, data_Y)

# Compare predictions against the true target values
# print(model.predict(data_X[:4, :]))
# print(data_Y[:4])

# Build a synthetic regression problem with sklearn:
# 100 samples, 1 feature, 1 target, with Gaussian noise added.
X, y = datasets.make_regression(n_samples=100, n_features=1, n_targets=1, noise=10)
# Visualise the generated points as a scatter plot.
plt.scatter(X, y)
plt.show()

sklearn的常用属性和功能

#!/usr/bin/env python3
# -*- coding:utf-8 -*-

import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt

# Load a bundled regression dataset.
# FIX: load_boston() was deprecated in scikit-learn 1.0 and removed in 1.2;
# fetch_california_housing() is the documented replacement and exposes the
# same .data / .target interface used below.
loaded_data = datasets.fetch_california_housing()
data_X = loaded_data.data    # feature matrix
data_Y = loaded_data.target  # regression target

# Fit an ordinary least-squares linear regression model.
model = LinearRegression()
model.fit(data_X, data_Y)

# Compare model predictions against the true target values.
# print(model.predict(data_X[:4, :]))
# print(data_Y[:4])

# For a model fitted to y = 0.1*x + 0.3, the attributes below would hold
# the slope and the intercept respectively.
# print(model.coef_)       # slope(s), e.g. 0.1
# print(model.intercept_)  # intercept, e.g. 0.3

# Parameters the model was constructed with (defaults if not specified).
# print(model.get_params())

# Score the fit using the R^2 coefficient of determination
# (1.0 would be a perfect fit).
# print(model.score(data_X, data_Y))

Normalization 正规化

#!/usr/bin/env python3
# -*- coding:utf-8 -*-
import numpy as np
from sklearn import preprocessing

# Build a 5x5 matrix of random integers drawn from [1, 100).
a = np.random.randint(1, 100, [5, 5])
print(a)
# Standardise each column to zero mean and unit variance.
print(preprocessing.scale(a))

# After scaling, the features share a comparable range, which generally
# makes it easier for machine-learning models to learn.
# Sample output:
[[33 42 35 57 6]
[15 33 58 1 52]
[18 9 45 79 58]
[ 4 56 20 24 49]
[19 9 49 50 62]]
[[ 1.63753557 0.65954618 -0.49224944 0.54681462 -1.94906258]
[-0.30165129 0.17299572 1.27677199 -1.52221366 0.32649272]
[ 0.02154652 -1.12447218 0.27689031 1.35964715 0.62330428]
[-1.48670993 1.41640246 -1.64595907 -0.67243419 0.17808694]
[ 0.12927912 -1.12447218 0.58454621 0.28818608 0.82117865]]

#!/usr/bin/env python3
# -*- coding:utf-8 -*-

import numpy as np
from sklearn import preprocessing
# FIX: import make_classification from the public sklearn.datasets package.
# sklearn.datasets._samples_generator is a private module and stopped being
# importable in scikit-learn 0.24.
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
import matplotlib.pyplot as plt

# Generate a toy classification problem: 300 samples, 2 features (both
# informative, none redundant), one cluster per class.
# random_state fixes the RNG seed so the data is reproducible;
# scale=100 multiplies the features so they span a large range.
X, y = make_classification(n_samples=300, n_features=2, n_redundant=0, n_informative=2,
                           random_state=22, n_clusters_per_class=1, scale=100)

# Visualise the raw input data.
# plt.scatter(X[:, 0], X[:, 1], c=y)
# plt.show()

# Standardise the features (zero mean, unit variance).
X = preprocessing.scale(X)
# Alternative: rescale each feature into a fixed range instead.
# X = preprocessing.minmax_scale(X, feature_range=(0, 1))

# Split into training (70%) and test (30%) sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3)

# Define and train a support-vector classifier.
clf = SVC()
clf.fit(X_train, y_train)

# Mean accuracy on the held-out test set.
print(clf.score(X_test, y_test))

交叉验证1

详细参考 https://scikit-learn.org/stable/modules/cross_validation.html#cross-validation

下图是交叉验证的原理

#!/usr/bin/env python3
# -*- coding:utf-8 -*-
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score

# Classification example: load the iris dataset.
iris = load_iris()
X = iris.data
y = iris.target

# Split into training and test sets (random_state fixes the shuffle).
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=4)

# Fit a k-nearest-neighbours classifier with k=5.
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

# Accuracy on the single held-out test set.
# y_pred = knn.predict(X_test)
# print(knn.score(X_test, y_test))

# 5-fold cross-validation: one accuracy score per fold.
# scores = cross_val_score(knn, X, y, cv=5, scoring='accuracy')
# print(scores)         # per-fold scores
# print(scores.mean())  # average over the folds


# Sweep k from 1 to 30, cross-validating each setting, to see which
# neighbourhood sizes work well.
k_range = range(1, 31)
k_scores = []
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(knn, X, y, cv=10, scoring='accuracy')  # for classification
    # For regression, score with a negated error metric instead; the minus
    # sign flips sklearn's "higher is better" convention back to a loss.
    # loss = -cross_val_score(knn, X, y, cv=10, scoring='neg_mean_squared_error')
    # FIX: append to k_scores, the per-k results list. The original called
    # scores.append(...), which crashes: cross_val_score returns an ndarray,
    # which has no append method.
    k_scores.append(scores.mean())

plt.plot(k_range, k_scores)
plt.xlabel('Value of K for KNN')
plt.ylabel('Cross-Validated Accuracy')
plt.show()

交叉验证2-过拟合

  • 多项式过多,过度拟合了当前的数据集
"""
Please note, this code is only for python 3+. If you are using python 2+, please modify the code accordingly.
"""
from __future__ import print_function
from sklearn.model_selection import learning_curve
from sklearn.datasets import load_digits
from sklearn.svm import SVC
import matplotlib.pyplot as plt
import numpy as np

digits = load_digits()
X = digits.data
y = digits.target
train_sizes, train_loss, test_loss = learning_curve(
SVC(gamma=0.001), X, y, cv=10, scoring='neg_mean_squared_error',
train_sizes=[0.1, 0.25, 0.5, 0.75, 1])
train_loss_mean = -np.mean(train_loss, axis=1)
test_loss_mean = -np.mean(test_loss, axis=1)

# 构造图分析
plt.plot(train_sizes, train_loss_mean, 'o-', color="r",
label="Training")
plt.plot(train_sizes, test_loss_mean, 'o-', color="g",
label="Cross-validation")

plt.xlabel("Training examples")
plt.ylabel("Loss")
plt.legend(loc="best")
plt.show()
  • 没有过拟合

    在这里插入图片描述

  • 过拟合

    在这里插入图片描述

    在学习一段过程后,模型用test集测试反而误差变大了,说明模型训练发生了过拟合

交叉验证3

  • 如何在有过拟合的情况下,来选取没有过拟合的参数
from __future__ import print_function
from sklearn.model_selection import validation_curve
from sklearn.datasets import load_digits
from sklearn.svm import SVC
import matplotlib.pyplot as plt
import numpy as np

# Load the handwritten-digits dataset.
digits = load_digits()
X = digits.data
y = digits.target

# Sweep the SVC gamma hyper-parameter over 5 logarithmically spaced
# values between 10^-6 and 10^-2.3.
param_range = np.logspace(-6, -2.3, 5)
# validation_curve refits the model for each gamma with 10-fold CV;
# scoring returns negated MSE, so flip the sign to get a positive loss.
train_loss, test_loss = validation_curve(
    SVC(), X, y, param_name='gamma', param_range=param_range, cv=10,
    scoring='neg_mean_squared_error')
train_loss_mean = -np.mean(train_loss, axis=1)
test_loss_mean = -np.mean(test_loss, axis=1)

# Plot training loss vs cross-validation loss against gamma.
plt.plot(param_range, train_loss_mean, 'o-', color="r", label="Training")
plt.plot(param_range, test_loss_mean, 'o-', color="g", label="Cross-validation")

plt.xlabel("gamma")
plt.ylabel("Loss")
plt.legend(loc="best")
plt.show()

模型保存

  • 一点点pickle

    • 保存数据的模块
    • 字典、列表、变量的保存
    import pickle

    a_dict = {'da': 111, 2: [23,1,4], '23': {1:2,'d':'sad'}}

    # pickle a variable to a file
    # (with-statement closes the file automatically, even on error)
    with open('pickle_example.pickle', 'wb') as file:
        pickle.dump(a_dict, file)

    # reload a file to a variable
    # FIX: the load call must be indented inside the with-block
    # (the original pasted snippet had it at the wrong level).
    with open('pickle_example.pickle', 'rb') as file:
        a_dict1 = pickle.load(file)

    print(a_dict1)
from __future__ import print_function
from sklearn import svm
from sklearn import datasets

clf = svm.SVC()
iris = datasets.load_iris()
X, y = iris.data, iris.target
clf.fit(X, y)

# method 1: pickle
import pickle

# save
with open('save/clf.pickle', 'wb') as f:
# 保存模型对象
pickle.dump(clf, f)
# restore
with open('save/clf.pickle', 'rb') as f:
# 输出模型对象
clf2 = pickle.load(f)
print(clf2.predict(X[0:1]))

# method 2: joblib
from sklearn.externals import joblib

# Save
joblib.dump(clf, 'save/clf.pkl')
# restore
clf3 = joblib.load('save/clf.pkl')
print(clf3.predict(X[0:1]))