Sklearn

Installation: pip install -U scikit-learn

datasets

Built-in datasets

from sklearn import datasets

# boston = datasets.load_boston()              # Boston housing, 506 samples, 13 features, regression; load_boston has been removed from scikit-learn since version 1.2
iris = datasets.load_iris()                    # Iris, 150 samples, 4 features, 3-class classification
breast_cancer = datasets.load_breast_cancer()  # Breast cancer, 569 samples, 30 features, binary classification
diabetes = datasets.load_diabetes()            # Diabetes, 442 samples, 10 features, regression
digits = datasets.load_digits()                # Handwritten digits, 1797 samples, 64 features, multi-class image classification
wine = datasets.load_wine()                    # Wine, 178 samples, 13 features, 3-class classification

print(iris.keys())                         # view keys (attributes): ['data', 'target', 'feature_names', 'DESCR', 'filename']
print(iris.data.shape, iris.target.shape)  # view data shapes: (150, 4) (150,)
print(iris.feature_names)                  # view feature names (4 features here)
print(iris.DESCR)                          # dataset description
print(iris.filename)                       # file path
data = iris.data                           # ndarray
target = iris.target                       # ndarray

Sample generators

from sklearn import datasets

"""Blobs (clusters)"""
x, y = datasets.make_blobs(n_samples=1000, n_features=2, centers=4, cluster_std=1, random_state=0)
# n_samples: number of samples; n_features: number of features (dimensions);
# centers: number of centers (or the center coordinates); cluster_std: standard deviation of each cluster

"""Concentric circles"""
x, y = datasets.make_circles(n_samples=5000, noise=0.04, factor=0.7, random_state=0)
# noise: noise level; factor: ratio of inner to outer circle radius, strictly between 0 and 1;
# the closer it is to 1, the closer the two circles are

"""Moons"""
x, y = datasets.make_moons(n_samples=3000, noise=0.05, random_state=0)

"""Classification data"""
x, y = datasets.make_classification(n_classes=4, n_samples=1000, n_features=2, n_informative=2,
                                    n_redundant=0, n_clusters_per_class=1, n_repeated=0, random_state=0)
# n_classes: number of classes
# n_informative: number of informative features
# n_redundant: number of redundant features (linear combinations of the informative features)
# n_repeated: number of duplicated features drawn from the informative and redundant features
# n_informative + n_redundant + n_repeated <= n_features
# n_clusters_per_class: number of clusters per class
# n_classes * n_clusters_per_class <= 2 ** n_informative

Metrics

pairwise

pairwise_distances: pairwise distances

from sklearn.metrics.pairwise import pairwise_distances

"""Parameters
pairwise_distances(X, Y=None, metric='euclidean', n_jobs=None, **kwds)
# Computes the pairwise distances between samples in an array/matrix.
# If only X is given:
#   computes the distance between every row (vector) of X and every other row of X.
# If both X and Y are given, computes the distance matrix between X and Y
#   (distance from every vector in X to every vector in Y).
# metric: distance metric; options include ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', 'manhattan']
#   - 'manhattan' / 'cityblock' / 'l1': Manhattan distance (city-block distance)
#   - 'euclidean' / 'l2': Euclidean distance
#   - 'cosine': cosine distance
#   (see the respective distance formulas for details)
# Returns a matrix of shape len(X) * len(Y).
"""
a = [[1, 3], [2, 2]]
b = [[1, 3], [2, 2], [1, 1]]
pairwise_distances(a, Y=b, metric="euclidean")
"""
array([[0.        , 1.41421356, 2.        ],
       [1.41421356, 0.        , 1.41421356]])
"""
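As a quick sanity check, the Euclidean result above can be reproduced by hand with plain NumPy (same a and b as in the example; this is just an illustration, not part of the pairwise_distances API):

import numpy as np

a = np.array([[1, 3], [2, 2]])
b = np.array([[1, 3], [2, 2], [1, 1]])
# broadcast to shape (2, 3, 2), take squared differences, sum over the feature axis, then take the square root
dist = np.sqrt(((a[:, None, :] - b[None, :, :]) ** 2).sum(axis=-1))
print(dist)   # shape (2, 3), matches the pairwise_distances output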

Scoring methods

import numpy as np
from sklearn.metrics import accuracy_score           # accuracy
from sklearn.metrics import precision_score          # precision
from sklearn.metrics import recall_score             # recall
from sklearn.metrics import f1_score                 # F1
from sklearn.metrics import auc                      # depending on the inputs, computes AUC-ROC or AUC-PR
from sklearn.metrics import roc_curve                # ROC curve
from sklearn.metrics import roc_auc_score            # AUC-ROC
from sklearn.metrics import precision_recall_curve   # PR curve
from sklearn.metrics import average_precision_score  # AUC-PR

y_true = np.array([0, 0, 1, 1])
y_pred = np.array([0.1, 0.4, 0.35, 0.8])
y_label = (y_pred >= 0.5) * 1   # convert probabilities to labels
print(y_label)                  # [0 0 0 1]

# AUC method 1: get the curve first, then compute the AUC
fpr, tpr, thresholds = roc_curve(y_true, y_pred, pos_label=1)           # ROC curve; returns fpr, tpr, thresholds. pos_label: label of the positive class (1 for {0, 1} labels)
roc_auc = auc(fpr, tpr)
precision, recall, thresholds = precision_recall_curve(y_true, y_pred)  # PR curve; returns precision, recall, thresholds
pr_auc = auc(recall, precision)

# AUC method 2: compute directly (uses the predicted probabilities)
roc_auc = roc_auc_score(y_true, y_pred)            # uses the predicted probabilities
pr_auc = average_precision_score(y_true, y_pred)   # uses the predicted probabilities

a = accuracy_score(y_true, y_label)    # accuracy; probabilities must be converted to labels first
p = precision_score(y_true, y_label)   # precision; labels required
r = recall_score(y_true, y_label)      # recall; labels required
f1 = f1_score(y_true, y_label)         # F1; labels required
print('roc_auc:', roc_auc)
print('pr_auc:', pr_auc)
print('accuracy:', a)
print('precision:', p)
print('recall:', r)
print('f1:', f1)

Model_selection

If ShuffleSplit and train_test_split are given the same random seed (the same random_state), the first split produced by ShuffleSplit is identical to the split produced by train_test_split.
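A minimal sketch to illustrate this, assuming the default shuffle=True and no stratify (toy data made up for the demonstration):

import numpy as np
from sklearn.model_selection import ShuffleSplit, train_test_split

X = np.arange(20).reshape(10, 2)

# single split via train_test_split
X_train, X_test = train_test_split(X, test_size=0.3, random_state=42)

# first split of ShuffleSplit with the same sizes and seed
ss = ShuffleSplit(n_splits=1, test_size=0.3, random_state=42)
train_idx, test_idx = next(ss.split(X))

print(np.array_equal(X_train, X[train_idx]))  # expected: True
print(np.array_equal(X_test, X[test_idx]))    # expected: True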

train_test_split: dataset splitting (single split)

from sklearn.model_selection import train_test_split

"""Parameters
train_test_split(*arrays, **options)
train_test_split(data, tag, train_size, test_size, random_state, shuffle=True, stratify)
- train_size:
  - a float: fraction of the data used for training; if test_size is not given, test_size defaults to 1 - train_size;
  - an integer: absolute number of training samples; if test_size is not given, test_size defaults to the remaining samples;
- test_size: same as above, but for the test set;
- random_state: random seed; use the same seed when experiments need to be reproducible;
- shuffle: default True, shuffle the data before splitting;
- stratify: an array/column used for stratified splitting; each class in it is sampled in the same proportion. Usually set to the labels.
"""
X_train, X_test, y_train, y_test = train_test_split(data, tag, test_size=0.3)
# If there are no labels, a single array can be passed in
train_data, test_data = train_test_split(data_small, test_size=0.3, stratify=data_small['userId'])

ShuffleSplit: dataset splitting (multiple random splits)

from sklearn.model_selection import ShuffleSplit

"""Parameters
ShuffleSplit(n_splits=10, test_size=None, train_size=None, random_state=None)
- n_splits: number of random splits, default 10
- test_size: size of the test set in each split
- train_size: size of the training set in each split; usually setting test_size alone is enough
- random_state: random seed; fixing it to one value is usually sufficient
"""
rs = ShuffleSplit(n_splits=10, test_size=0.3, random_state=1)
for train_index, test_index in rs.split(df):
    X_train = df.loc[train_index, X.columns]
    y_train = df.loc[train_index, 'y']
    X_test = df.loc[test_index, X.columns]
    y_test = df.loc[test_index, 'y']

KFold: dataset splitting (cross-validation)

KFold uses n_splits to divide the dataset into mutually exclusive folds; in each iteration one fold is used as the test set. If shuffle is False, the folds follow the original index order of the dataset.
from sklearn.model_selection import KFold

"""Parameters
KFold(n_splits=5, shuffle=False, random_state=None)
- n_splits: number of folds
- shuffle: whether to shuffle the original data before splitting, default False
- random_state: random seed; only takes effect when shuffle=True
"""
kf = KFold(n_splits=2)
for train_index, test_index in kf.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

StratifiedKFold: dataset splitting (stratified cross-validation)

StratifiedKFold takes the same parameters as KFold, but it performs stratified sampling: each fold preserves the class proportions of the labels.
from sklearn.model_selection import StratifiedKFold

# Parameters are the same as KFold, see above
# StratifiedKFold(n_splits=5, shuffle=False, random_state=None)
skf = StratifiedKFold(n_splits=2)
for train_index, test_index in skf.split(X, y):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

cross_val_score: model selection (cross-validation)

from sklearn.model_selection import cross_val_score

"""Parameters
cross_val_score(estimator, X, y=None, scoring=None, cv=None, n_jobs=None, ...)
- estimator: the estimator object (classifier)
- X: features
- y: labels
- scoring: model scoring method
  - https://blog.csdn.net/qq_41076797/article/details/102755893
  - https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter
- cv: an integer K uses the default (Stratified)KFold cross-validation; a custom splitter can also be passed
"""
from sklearn import datasets
from sklearn import svm
from sklearn.model_selection import cross_val_score

iris = datasets.load_iris()
clf = svm.SVC(kernel='linear', C=1)
scores = cross_val_score(clf, iris.data, iris.target, cv=5)
print(scores)

cross_validate: model selection (cross-validation)

cross_validate differs from cross_val_score in two ways:
  • It accepts multiple evaluation metrics, passed as a list or a dict.
  • It returns a dict (cross_val_score returns an array) whose keys are: dict_keys(['fit_time', 'score_time', 'test_score', 'train_score'])
    • fit_time: training time; score_time: scoring time; test_score: array of test-fold scores; train_score: array of training-fold scores; comparing train_score with test_score helps diagnose over- or under-fitting.
from sklearn.model_selection import cross_validate

"""Parameters
cross_validate(estimator, X, y=None, scoring=None, cv=None, n_jobs=None, ...)
- estimator: the estimator object (classifier)
- X: features
- y: labels
- scoring: model scoring method(s); multiple metrics can be passed as a list or a dict
  - https://blog.csdn.net/qq_41076797/article/details/102755893
  - https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter
- cv: an integer K uses the default (Stratified)KFold cross-validation; a custom splitter can also be passed
"""
from sklearn.model_selection import cross_validate
from sklearn.svm import SVC
from sklearn.datasets import load_iris

iris = load_iris()
scoring = ['precision_macro', 'recall_macro']
# scoring = ['precision', 'recall', 'f1', 'roc_auc', 'average_precision']
clf = SVC(kernel='linear', C=1, random_state=0)
scores = cross_validate(clf, iris.data, iris.target, scoring=scoring, cv=5,
                        return_train_score=False)   # set return_train_score=True to also get training-fold scores
print(scores.keys())                 # keys of the returned dict
print(scores['test_recall_macro'])   # recall scores on the test folds only
💡
Negative mean squared error: although the mean squared error is always positive, when MSE is used as a criterion under sklearn's scoring parameter it is reported as the "negative mean squared error" (neg_mean_squared_error). This is because sklearn takes the nature of each metric into account when computing evaluation scores: MSE is an error, so sklearn classifies it as a loss. In sklearn all losses are expressed as negative numbers, which is why the MSE is shown as negative. The true MSE is simply neg_mean_squared_error with the sign removed.
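For example, a small sketch (the diabetes dataset and LinearRegression are used purely for illustration) that recovers the true MSE by dropping the sign:

from sklearn.datasets import load_diabetes
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

X, y = load_diabetes(return_X_y=True)
scores = cross_val_score(LinearRegression(), X, y, scoring='neg_mean_squared_error', cv=5)
print(scores)    # all values are negative (losses are reported as negative numbers)
print(-scores)   # drop the sign to obtain the actual MSE of each fold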
 

GridSearchCV: grid search

from sklearn.model_selection import GridSearchCV

"""Parameters
GridSearchCV(estimator, param_grid, *, scoring=None, n_jobs=None, refit=True, cv=None, verbose=0,
             pre_dispatch='2*n_jobs', error_score=nan, return_train_score=False)
- estimator: the estimator to tune
- param_grid: dict or list of dicts giving the parameter values to search over
- scoring: scoring criterion, default None
  - common values: 'accuracy', 'precision', 'recall', 'f1', 'roc_auc'
  - scoring references:
    - https://blog.csdn.net/qq_41076797/article/details/102755893
    - https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter
- n_jobs: number of parallel jobs, int, default 1; -1 uses all available cores
- cv: number of cross-validation folds, int, default 5
"""
from sklearn import svm, datasets
from sklearn.model_selection import GridSearchCV

iris = datasets.load_iris()
parameters = {'kernel': ('linear', 'rbf'), 'C': [1, 10]}
svc = svm.SVC()
grid_search = GridSearchCV(svc, parameters)
grid_search.fit(iris.data, iris.target)
grid_search.best_score_      # best score found
grid_search.best_params_     # best parameter combination found
grid_search.best_estimator_  # best estimator (with all its parameter values)

feature_selection

VarianceThreshold: variance-based feature selector

from sklearn.feature_selection import VarianceThreshold

"""
VarianceThreshold(threshold=0.0)
- threshold: variance threshold. Features whose training-set variance is below this threshold are removed.
  The default keeps all features with non-zero variance, i.e. it removes features that have the same value in every sample.
"""
X = [[0, 2, 0, 3], [0, 1, 4, 3], [0, 1, 1, 3]]
selector = VarianceThreshold()
selector.fit_transform(X)

"""How to choose the variance threshold
1. Experience or domain knowledge (common values in the field, values recommended in papers, etc.)
2. Variance distribution: plot a histogram or box plot to visualize the variance of each feature
3. Evaluate candidate thresholds with cross-validation (see the sketch after this block)
"""
# Choosing a threshold by plotting
import numpy as np
import matplotlib.pyplot as plt

df = df.apply(lambda x: x / x.mean())   # normalize: the features may live on very different scales
variances = np.var(df, axis=0)          # variance of each feature, returned as a 1-D array
plt.bar(range(len(variances)), variances)
plt.xlabel('Feature Index')
plt.ylabel('Variance')
plt.title('Variance Distribution')
plt.show()
# pick a suitable threshold from the plot
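A hedged sketch of option 3 above (evaluating candidate thresholds with cross-validation): the breast-cancer dataset, the candidate thresholds, and the LogisticRegression downstream model are all illustrative choices, not a recommendation.

from sklearn.datasets import load_breast_cancer
from sklearn.feature_selection import VarianceThreshold
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline

X, y = load_breast_cancer(return_X_y=True)
for t in [0.0, 0.01, 0.05, 0.1]:                       # candidate thresholds (hypothetical values)
    pipe = Pipeline([
        ('var', VarianceThreshold(threshold=t)),        # drop low-variance features
        ('clf', LogisticRegression(max_iter=5000)),     # downstream model used for scoring
    ])
    scores = cross_val_score(pipe, X, y, cv=5)
    print(f'threshold={t}: mean accuracy={scores.mean():.3f}')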

SelectKBest: feature selector (select by count)

from sklearn.feature_selection import SelectKBest

"""
SelectKBest(score_func=<function f_classif>, *, k=10)
- score_func: scoring function used for feature selection
  - classification:
    - chi2: chi-squared test (requires non-negative data)
    - f_classif: F-test
    - mutual_info_classif: mutual information
  - regression:
    - f_regression: F-test
    - mutual_info_regression: mutual information
- k: number of features to keep
"""
from sklearn.datasets import load_digits
from sklearn.feature_selection import SelectKBest, chi2

X, y = load_digits(return_X_y=True)
X.shape
X_new = SelectKBest(chi2, k=20).fit_transform(X, y)  # keep the 20 best features according to the chi-squared test
X_new.shape

SelectPercentile: feature selector (select by percentage)

from sklearn.datasets import load_digits
from sklearn.feature_selection import SelectPercentile, chi2

X, y = load_digits(return_X_y=True)
X.shape
X_new = SelectPercentile(chi2, percentile=10).fit_transform(X, y)  # keep the best 10% of features according to the chi-squared test
X_new.shape

RFE / RFECV: feature selector (recursive feature elimination)

from sklearn.feature_selection import RFE
from sklearn.feature_selection import RFECV  # with cross-validation
from sklearn.svm import SVR
from sklearn.datasets import make_friedman1

"""
RFECV(estimator, *, step=1, min_features_to_select=1, cv=None, scoring=None, verbose=0, n_jobs=None,
      importance_getter='auto')
- estimator: a supervised learning estimator
- step:
  - >= 1: number of features to remove at each iteration
  - < 1: fraction of features to remove at each iteration
- min_features_to_select: minimum number of features to keep
- cv: number of cross-validation folds
- scoring: scoring function (see the scoring references above)
- verbose: controls the verbosity of the output
"""
X, y = make_friedman1(n_samples=50, n_features=10, random_state=0)
estimator = SVR(kernel="linear")
selector = RFECV(estimator, step=1, cv=5)
selector = selector.fit(X, y)
selector.support_   # mask of selected features: a boolean array with one entry per original feature; True means the feature is kept, otherwise it is removed
selector.ranking_   # feature ranking: ranking_[i] is the rank of the i-th feature

SelectFromModel: feature selector (embedded method)

from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression

"""
SelectFromModel(estimator, *, threshold=None, prefit=False, norm_order=1, max_features=None,
                importance_getter='auto')
- estimator: the base estimator
- threshold: feature-importance threshold used for selection
- max_features: maximum number of features to select
"""
X = [[ 0.87, -1.34,  0.31],
     [-2.79, -0.02, -0.85],
     [-1.34, -0.48, -2.55],
     [ 1.92,  1.48,  0.65]]
y = [0, 1, 0, 1]
selector = SelectFromModel(estimator=LogisticRegression()).fit(X, y)
selector.estimator_.coef_    # coefficients of the fitted estimator
selector.threshold_          # threshold actually used
selector.get_support()       # mask of selected features
selector.transform(X)        # data reduced to the selected features

Feature_extraction

text

CountVectorizer: term frequency counting

from sklearn.feature_extraction.text import CountVectorizer

"""
# Parameters
CountVectorizer(*, input='content')
- input: string {'filename', 'file', 'content'}, default='content'
  - 'filename': a list of file names
  - 'file': file objects
  - otherwise, the input is expected to be strings or bytes
# Attributes
vocabulary_: the vocabulary as a dict mapping terms to indices
# Methods
fit()
fit_transform()
get_feature_names()   # get_feature_names_out() in newer scikit-learn versions
get_stop_words()
get_params()
"""
# Example
corpus = ['This is the first document.',
          'This document is the second document.',
          'And this is the third one.',
          'Is this the first document?']
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
# vocabulary: each term is assigned an index
print(sorted(vectorizer.vocabulary_.items(), key=lambda x: x[1], reverse=False))
# convert the bag of words to a dense matrix
X.toarray()

Externals

joblib: saving and loading models & parameters

from sklearn.ensemble import RandomForestClassifier
from sklearn import datasets
# sklearn no longer bundles joblib since version 0.23
# from sklearn.externals import joblib  # old versions
import joblib                           # new versions

(X, y) = datasets.load_iris(return_X_y=True)
rfc = RandomForestClassifier(n_estimators=100, max_depth=100)
rfc.fit(X, y)
print(rfc.predict(X[0:1, :]))
joblib.dump(rfc, 'saved_model/rfc.pkl')    # save model
rfc2 = joblib.load('saved_model/rfc.pkl')  # load model
print(rfc2.predict(X[0:1, :]))

Compose

ColumnTransformer: per-column feature transformations

""" 功能:将数据中的指定的特征按照指定的方式进行转换 参数:参数很多,但一般只需要看前两个 ColumnTransformer(transformers, *, remainder='drop', sparse_threshold=0.3, n_jobs=None, transformer_weights=None, verbose=False, verbose_feature_names_out=True) - remainder:剩余部分的处理方式:drop删除,passthrough忽略,不做任何处理 - transformers:转换器,一个三元素元组,分别是转换器的名称、转换的方式、需要转换的列(组成的列表),例如:(名称,对象,列) """ import numpy as np import pandas as pd from sklearn.impute import SimpleImputer from sklearn.compose import ColumnTransformer from sklearn.preprocessing import OneHotEncoder # 独热编码 from sklearn.preprocessing import StandardScaler # 标准化 from sklearn.preprocessing import MinMaxScaler # 最大最小值归一化 from sklearn.preprocessing import KBinsDiscretizer # 离散化 from sklearn.preprocessing import LabelEncoder # 连续编号编码,只接受一维数据(在ColumnTransformer中使用会报错) from sklearn.preprocessing import OrdinalEncoder # 连续编号编码,接受二维及以上数据 data = { 'age': [21, 15, 22, 25, 26], 'gender': ['男', '女', '男', '男', '女'], 'major': ['计算机', '软件工程', '物理', '计算机', '数学'], 'score': [87, 38, 90, 95, 60], 'hobby': ['篮球', None, '足球', '乒乓球', '游泳'] } data = pd.DataFrame(data) ct = ColumnTransformer([ ('ordinal', OrdinalEncoder(), ['gender','hobby']), # 连续编号编码 ('onehot', OneHotEncoder(sparse=False), ['major']), # onehot ('discretizer', KBinsDiscretizer(n_bins=3), ['age']), # 离散化 ('scale', StandardScaler(), ['score']), # 标准化 ('num', SimpleImputer(strategy='median'), [0, 1]), # 填充缺失值 ],remainder='passthrough') # 剩余字段不做操作 ],remainder=StandardScaler()) # 剩余字段标准化 ct.fit_transform(data) """ array([[ 1. , 2. , 0. , 0. , 1. , 0. , 1. , 0. , 0. , 0.59862721], [ 0. , 4. , 0. , 0. , 0. , 1. , 1. , 0. , 0. , -1.65773689], [ 1. , 3. , 0. , 1. , 0. , 0. , 0. , 1. , 0. , 0.73677195], [ 1. , 0. , 0. , 0. , 1. , 0. , 0. , 0. , 1. , 0.96701319], [ 0. , 1. , 1. , 0. , 0. , 0. , 0. , 0. , 1. , -0.64467546]]) """

impute

SimpleImputer: missing-value imputation

""" SimpleImputer(*, missing_values=nan, strategy=‘mean’, fill_value=None, verbose=0, copy=True, add_indicator=False) - missing_values:缺失值类型:int, float, str, np.nan(默认)或是None - strategy:空值填充的策略:mean(均值)、median(中位数)、most_frequent(众数)、constant(自定义值,由fill_value的值指定) - fill_value:填充值,当strategy=="constant"时生效,默认为Zone(0) - copy:(默认)True,表示对数据的副本进行处理,False对数据原地修改。 """ import numpy as np from sklearn.impute import SimpleImputer # 缺失值填充 X1 = np.array([[1, 2, np.nan], [4, np.nan, 6], [np.nan, 8, 9]]) imp = SimpleImputer(missing_values=np.nan, strategy='mean') print(imp.fit_transform(X1))
 
If you have any questions, please contact me.