机器学习算法 Python

您所在的位置:网站首页 随机森林feature_importances 机器学习算法 Python

机器学习算法 Python

2023-03-15 09:09| 来源: 网络整理| 查看: 265

随机森林回归 预测选择重要变量 重要性排序其他回归 预测代码

使用比较简单,直接调用库就好了。

教程目录 - sklearn中文文档sklearn 快速入门教程 - 郭峰g - 博客园Python之Sklearn使用入门教程python脚本之家

代码 D:\00000py\1\mcm 2020b_q3_0008own.py

# 定义模型 训练
models_str = ['LinearRegression',  # 普通最小二乘法 (线性回归)
              'MLPRegressor',      # 多层感知器(MLP) (神经网络) # 训练异常 可能本身有问题
              'DecisionTree',      # 决策树
              'SVR',               # 支持向量机
              'GBDT',              # 梯度提升决策树
              'lightGBM',          # 轻型梯度提升机
              'RandomForest'       # 随机森林
              ]

2020B 0027:XGBoost LightGBM 随机森林 SVM BP神经网络2020B 0089:随机森林(用于确定重要性)2020B 0036:随机森林(用于确定重要性/回归) GBR(GBDT)2020B 0008:各模型对比 LR LightGBM GBDT SVR DT(DecisionTree) 神经网络2020B 0116:随机森林 决策树 SVM GBDT2020B 0031:。。。

随机森林

回归 预测

文章 D:\00000MCM\0 研究生数学建模竞赛历年真题和优秀论文集锦\研究生数学建模-优秀论文\2020年优秀论文\B题\B20102470089.pdf

回归预测见后面代码

选择重要变量 重要性排序

importances = models[6].feature_importances_  # 获取重要性 选择定义的模型
print(importances)
print(sorted(importances, reverse=True))  # 排序

其他

回归 预测

'LinearRegression',  # 普通最小二乘法 (线性回归)
'MLPRegressor',      # 多层感知器(MLP) (神经网络) # 训练异常 可能本身有问题
'DecisionTree',      # 决策树
'SVR',               # 支持向量机
'GBDT',              # 梯度提升决策树
'lightGBM',          # 轻型梯度提升机

代码

D:\00000py\1\mcm 2020b_q3_0008own.py

# Reconstructed from a scraped web page: the original statements were fused
# onto single lines (e.g. "import numpy as npimport matplotlib...") and were
# not runnable Python. Logic preserved; comments translated to English.
import numpy as np
import matplotlib.pyplot as plt
import lightgbm as lgb
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import cross_val_score  # cross-validation
from sklearn.datasets import load_iris
import time
import csv
import re
from openpyxl import load_workbook  # xlsx files need a dedicated reader
import xlrd
import warnings

# filter warnings
warnings.filterwarnings('ignore')

# render Chinese glyphs correctly in matplotlib
from pylab import mpl
mpl.rcParams['font.sans-serif'] = ['SimHei']
# render the minus sign correctly
from matplotlib import rcParams
rcParams['axes.unicode_minus'] = False


def normalization(data):
    """Min-max scale over the WHOLE array (single global min/max).

    NOTE: normalization/normalization1 scale across all entries at once,
    which mixes columns; normalization2 below scales per column and is the
    one actually applied to the feature matrix.
    """
    _range = np.max(data) - np.min(data)
    return (data - np.min(data)) / _range


def standardization(data):
    """Column-wise z-score standardisation: (x - mean) / std."""
    mu = np.mean(data, axis=0)
    sigma = np.std(data, axis=0)
    return (data - mu) / sigma


def normalization1(data):
    """Scale the whole array into [-1, 1] by its largest absolute value."""
    _range = np.max(abs(data))
    return data / _range


def normalization2(data):
    """Column-wise min-max scaling.

    Returns (normData, ranges, minVals) so the transform can be inverted.
    NOTE(review): a constant column yields range 0 and a divide-by-zero
    (inf/nan) here — confirm the input has no constant features.
    """
    minVals = data.min(0)
    maxVals = data.max(0)
    ranges = maxVals - minVals
    m = data.shape[0]
    normData = data - np.tile(minVals, (m, 1))
    normData = normData / np.tile(ranges, (m, 1))
    return normData, ranges, minVals


def draw(x_train_label, x_test_label, x_label, train_y, test_y, original_y,
         picture_path, model_name):
    """Plot train/test predictions and the original series, save to picture_path."""
    plt.title(model_name, fontsize=22)
    plt.xlabel('样本编号')
    plt.ylabel('RON 损失值')
    colors1 = '#FFA500'  # train set: orange
    colors2 = '#FF4500'  # test set: orange-red
    colors3 = '#1E90FF'  # original data: light blue
    # marker areas kept from an earlier scatter-plot variant of this figure
    area1 = np.pi * 2 ** 2
    area2 = np.pi * 3 ** 2
    area3 = np.pi * 4 ** 2
    # line plots
    plt.plot(x_train_label, train_y, c=colors1, linewidth=0.9, label='训练集')
    plt.plot(x_test_label, test_y, c=colors2, linewidth=0.9, label='测试集')
    plt.plot(x_label, original_y, c=colors3, linewidth=0.9, label='原始数据')
    plt.legend()
    plt.savefig(picture_path, dpi=300)  # saved image may not refresh on re-run
    plt.show()


def p_words(string):
    """Return the first decimal literal (e.g. '3.14') found in *string*."""
    string_list = re.findall(r"\d+\.\d+", string)
    return string_list[0]


plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

# --- read data into an ndarray -------------------------------------------
data = xlrd.open_workbook(r'D:\00000MCM\0 codes\2020B\q3forml.xlsx')
table = data.sheet_by_index(0)  # worksheet 1 (index 0)
all_data = []
for i in range(1, table.nrows):  # skip the header row
    line = table.row_values(i)   # one row as a list
    all_data.append(line)
all_data = np.array(all_data)

# --- split columns --------------------------------------------------------
data_label = all_data[:, 0]  # column 0: sample ids 1..n, used for plotting
target = all_data[:, -1]     # last column: regression target
# per-column min-max scaling of the feature columns
data, _, _ = normalization2(all_data[:, 1:-1])
print(data[0])
print(len(data[0]))
print(target[0])
print()

# --- train/test split (first 70% / last 30%, no shuffling) ----------------
train_size = int(len(data) * 0.7)
x_train = data[:train_size]
x_test = data[train_size:]
y_train = target[:train_size]
y_test = target[train_size:]
x_train_label = data_label[:train_size]
x_test_label = data_label[train_size:]
x_label = data_label

# --- define models --------------------------------------------------------
models_str = ['LinearRegression',  # ordinary least squares
              'MLPRegressor',      # MLP neural net; trained poorly, possibly
                                   # because the target y is not normalised
              'DecisionTree',
              'SVR',               # support vector regression
              'GBDT',              # gradient-boosted decision trees
              'lightGBM',
              'RandomForest',
              ]
models = [
    # fix: the `normalize=True` kwarg was removed in scikit-learn 1.2 (crashes
    # on current versions); features are already normalised above anyway
    LinearRegression(),
    MLPRegressor(alpha=0.01),
    DecisionTreeRegressor(),
    SVR(),
    GradientBoostingRegressor(),
    lgb.LGBMRegressor(objective='regression', num_leaves=31,
                      learning_rate=0.05, n_estimators=20),
    RandomForestRegressor(),
]

MSE_lists = []  # NOTE: actually RMSE values (sqrt applied below)
MAE_lists = []
R2_lists = []
Run_Time_lists = []
cv_jiaocha_flag = False  # cross-validation switch: True / False
n_folds = 5              # cross-validation folds
Cross_val_lists = []

for name, model in zip(models_str, models):
    print('开始训练模型:' + name)
    model.fit(x_train, y_train)
    startTime = time.time()  # times prediction only, not fitting
    y_train_pred = model.predict(x_train)
    y_test_pred = model.predict(x_test)
    stopTime = time.time()
    if cv_jiaocha_flag:
        Cross_val = cross_val_score(model, x_train, y_train, cv=n_folds)
    if name == 'LinearRegression':
        # report the fitted linear coefficients
        print('截距: ', end='')
        print(model.intercept_)
        print('回归系数: ', end='')
        print(model.coef_)
    # save figure (the saved file may not refresh on re-run)
    save_path = '.\\datasets\\ml\\' + name + '.jpg'
    draw(x_train_label, x_test_label, x_label,
         y_train_pred, y_test_pred, target, save_path, name)
    # evaluation metrics; MSE here is actually RMSE (sqrt of MSE), matching
    # the 'rmse' wording printed below
    MSE = mean_squared_error(y_test, y_test_pred) ** 0.5
    MAE = mean_absolute_error(y_test, y_test_pred)
    R2 = r2_score(y_test, y_test_pred)
    Run_Time = stopTime - startTime
    MSE_lists.append(MSE)
    MAE_lists.append(MAE)
    R2_lists.append(R2)
    Run_Time_lists.append(Run_Time)
    if cv_jiaocha_flag:
        Cross_val_lists.append(Cross_val)
    print('The rmse of prediction is:', MSE)
    print('The mae of prediction is:', MAE)
    print('The r2 of prediction is:', R2)
    print('The Run_Time of prediction is:', Run_Time)
    print()

print('models: ', end='')
print(models_str)
print('MSE_lists: ', end='')
print(MSE_lists)
print('MAE_lists: ', end='')
print(MAE_lists)
print('R2_lists: ', end='')
print(R2_lists)
print('Run_Time_lists: ', end='')
print(Run_Time_lists)
if cv_jiaocha_flag:
    print('Cross_val_lists: ', end='')
    print(Cross_val_lists)
print('finish')

# fix: look the forest up by name instead of the fragile hard-coded index 6
importances = models[models_str.index('RandomForest')].feature_importances_
print(importances)
print(sorted(importances, reverse=True))


【本文地址】


今日新闻


推荐新闻


CopyRight 2018-2019 办公设备维修网 版权所有 豫ICP备15022753号-3