100字范文 > 数据科学和人工智能技术笔记十模型选择

数据科学和人工智能技术笔记十模型选择

时间：2022-05-30 23:52:06

相关推荐

数据科学和人工智能技术笔记十模型选择

十、模型选择

作者：Chris Albon
译者：飞龙
协议：CC BY-NC-SA 4.0

在模型选择期间寻找最佳预处理步骤

在进行模型选择时，我们必须小心正确处理预处理。首先，GridSearchCV使用交叉验证来确定哪个模型表现最好。然而，在交叉验证中，我们假装作为测试集被留出的一折是不可见的，因此不适合一些预处理步骤（例如缩放或标准化）。出于这个原因，我们无法预处理数据然后运行GridSearchCV。

其次，一些预处理方法有自己的参数，通常必须由用户提供。通过在搜索空间中包括候选成分值，可以像对待任何想要搜索其他超参数一样对待它们。

# 加载库import numpy as npfrom sklearn import datasetsfrom sklearn.feature_selection import SelectKBestfrom sklearn.linear_model import LogisticRegressionfrom sklearn.model_selection import GridSearchCVfrom sklearn.pipeline import Pipeline, FeatureUnionfrom sklearn.decomposition import PCAfrom sklearn.preprocessing import StandardScaler# 设置随机种子np.random.seed(0)# 加载数据iris = datasets.load_iris()X = iris.datay = iris.target

我们包括两个不同的预处理步骤：主成分分析和 k 最佳特征选择。

# 创建组合预处理对象preprocess = FeatureUnion([('pca', PCA()), ("kbest", SelectKBest(k=1))])# 创建流水线pipe = Pipeline([('preprocess', preprocess), ('classifier', LogisticRegression())])# 创建候选值空间search_space = [{'preprocess__pca__n_components': [1, 2, 3],'classifier__penalty': ['l1', 'l2'],'classifier__C': np.logspace(0, 4, 10)}]# 创建网格搜索clf = GridSearchCV(pipe, search_space, cv=5, verbose=0, n_jobs=-1)# 拟合网格搜索best_model = clf.fit(X, y)# 查看最佳超参数print('Best Number Of Princpal Components:', best_model.best_estimator_.get_params()['preprocess__pca__n_components'])print('Best Penalty:', best_model.best_estimator_.get_params()['classifier__penalty'])print('Best C:', best_model.best_estimator_.get_params()['classifier__C'])'''Best Number Of Princpal Components: 3Best Penalty: l1Best C: 59.9484250319 '''

使用网格搜索的超参数调优

# 加载库import numpy as npfrom sklearn import linear_model, datasetsfrom sklearn.model_selection import GridSearchCV# 加载数据iris = datasets.load_iris()X = iris.datay = iris.target# 创建逻辑回归logistic = linear_model.LogisticRegression()# 创建正则化惩罚空间penalty = ['l1', 'l2']# 创建正则化超参数空间C = np.logspace(0, 4, 10)# 创建超参数选项hyperparameters = dict(C=C, penalty=penalty)# 使用 5 折交叉验证创建网格搜索clf = GridSearchCV(logistic, hyperparameters, cv=5, verbose=0)# 拟合网格搜索best_model = clf.fit(X, y)# 查看最佳超参数print('Best Penalty:', best_model.best_estimator_.get_params()['penalty'])print('Best C:', best_model.best_estimator_.get_params()['C'])'''Best Penalty: l1Best C: 7.74263682681 '''# 预测目标向量best_model.predict(X)'''array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1,1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2,2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]) '''

使用随机搜索的超参数调优

# 加载库from scipy.stats import uniformfrom sklearn import linear_model, datasetsfrom sklearn.model_selection import RandomizedSearchCV# 加载数据iris = datasets.load_iris()X = iris.datay = iris.target# 创建逻辑回归logistic = linear_model.LogisticRegression()# 创建正则化惩罚空间penalty = ['l1', 'l2']# 使用均匀分布创建正则化超参数分布C = uniform(loc=0, scale=4)# 创建超参数选项hyperparameters = dict(C=C, penalty=penalty)# 使用 5 折交叉验证和 100 个迭代clf = RandomizedSearchCV(logistic, hyperparameters, random_state=1, n_iter=100, cv=5, verbose=0, n_jobs=-1)# 拟合随机搜索best_model = clf.fit(X, y)# 查看最佳超参数print('Best Penalty:', best_model.best_estimator_.get_params()['penalty'])print('Best C:', best_model.best_estimator_.get_params()['C'])'''Best Penalty: l1Best C: 1.66808801881 '''# 预测目标向量best_model.predict(X)'''array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1,1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 2,2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]) '''

使用网格搜索的模型选择

# 加载库import numpy as npfrom sklearn import datasetsfrom sklearn.linear_model import LogisticRegressionfrom sklearn.ensemble import RandomForestClassifierfrom sklearn.model_selection import GridSearchCVfrom sklearn.pipeline import Pipeline# 设置随机种子np.random.seed(0)# 加载数据iris = datasets.load_iris()X = iris.datay = iris.target

请注意，我们包括需要搜索的多个可能的学习算法和多个可能的超参数值。

# 创建流水线pipe = Pipeline([('classifier', RandomForestClassifier())])# 创建候选学习算法和它们的超参数的空间search_space = [{'classifier': [LogisticRegression()],'classifier__penalty': ['l1', 'l2'],'classifier__C': np.logspace(0, 4, 10)},{'classifier': [RandomForestClassifier()],'classifier__n_estimators': [10, 100, 1000],'classifier__max_features': [1, 2, 3]}]# 创建网格搜索clf = GridSearchCV(pipe, search_space, cv=5, verbose=0)# 拟合网格搜索best_model = clf.fit(X, y)# 查看最佳模型best_model.best_estimator_.get_params()['classifier']'''LogisticRegression(C=7.7426368268112693, class_weight=None, dual=False,fit_intercept=True, intercept_scaling=1, max_iter=100,multi_class='ovr', n_jobs=1, penalty='l1', random_state=None,solver='liblinear', tol=0.0001, verbose=0, warm_start=False) '''# 预测目标向量best_model.predict(X)'''array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1,1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2,2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]) '''

带有参数选项的流水线

# 导入所需的包import numpy as npfrom sklearn import linear_model, decomposition, datasetsfrom sklearn.pipeline import Pipelinefrom sklearn.model_selection import GridSearchCV, cross_val_scorefrom sklearn.preprocessing import StandardScaler# 加载乳腺癌数据集dataset = datasets.load_breast_cancer()# 从数据集特征中创建 XX = dataset.data# 从数据集目标中创建 yy = dataset.target# 创建缩放器对象sc = StandardScaler()# 创建 PCA 对象pca = decomposition.PCA()# 创建逻辑回归对象，带有 L2 惩罚logistic = linear_model.LogisticRegression()# 创建三步流水线。首先，标准化数据。# 其次，使用 PCA 转换数据。# 然后在数据上训练逻辑回归。pipe = Pipeline(steps=[('sc', sc), ('pca', pca), ('logistic', logistic)])# 创建 1 到 30 的一列整数（X + 1，特征序号）n_components = list(range(1,X.shape[1]+1,1))# 创建正则化参数的一列值C = np.logspace(-4, 4, 50)# 为正则化乘法创建一列选项penalty = ['l1', 'l2']# 为所有参数选项创建字典 # 注意，你可以使用 '__' 来访问流水线的步骤的参数parameters = dict(pca__n_components=n_components, logistic__C=C,logistic__penalty=penalty)# 创建网格搜索对象clf = GridSearchCV(pipe, parameters)# 拟合网格搜索clf.fit(X, y)# 查看超参数print('Best Penalty:', clf.best_estimator_.get_params()['logistic__penalty'])print('Best C:', clf.best_estimator_.get_params()['logistic__C'])print('Best Number Of Components:', clf.best_estimator_.get_params()['pca__n_components'])# 使用 3 折交叉验证拟合网格搜索cross_val_score(clf, X, y)

本内容不代表本网观点和政治立场，如有侵犯你的权益请联系我们处理。

网友评论

网友评论仅供其表达个人看法，并不表明网站立场。