Nestimators

您所在的位置:网站首页 estimaters Nestimators

Nestimators

2023-11-03 15:28| 来源: 网络整理| 查看: 265

# Nestimators
#
# Import the required packages.
from xgboost import XGBClassifier
import xgboost as xgb

import pandas as pd
import numpy as np

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss

from matplotlib import pyplot
import seaborn as sns

# Notebook-only: render figures inline. Written as a guarded call rather than
# the bare `%matplotlib inline` magic so the file still parses and runs as a
# plain Python script, where get_ipython is not defined.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    pass

# Read the data.
# Path to the directory where the data lies.
dpath = './data/'
train = pd.read_csv(dpath + "Otto_train.csv")
train.head()

在这里插入图片描述

Variable Identification

选择该数据集是因为它的数据特征单一,我们可以在特征工程方面少做些工作,集中精力放在参数调优上

# Target distribution: check whether the classes are balanced.
# The column is passed as x= explicitly because recent seaborn releases no
# longer interpret the first positional argument as x.
sns.countplot(x=train.target)
pyplot.xlabel('target')
pyplot.ylabel('Number of occurrences');  # trailing ';' keeps a notebook from echoing the return value

在这里插入图片描述 每类样本分布不是很均匀,所以交叉验证时也考虑各类样本按比例抽取

# Build the label vector and the feature matrix.
# Each target value has its first six characters stripped and the remainder
# parsed as an integer, then shifted down by one.
# NOTE(review): presumably targets look like 'Class_<k>' - confirm against the CSV.
y_train = train['target'].map(lambda raw_label: int(raw_label[6:]) - 1)

# The id and target columns are not features.
train = train.drop(columns=["id", "target"])
X_train = np.array(train)

各类样本不均衡,交叉验证时采用StratifiedKFold,在每折采样时各类样本按比例采样

# Cross-validation splitter: five shuffled, stratified folds with a fixed seed.
kfold = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=3,
)

默认参数,此时学习率为0.1,比较大,观察弱分类器数目的大致范围 (采用默认参数配置,看看模型是过拟合还是欠拟合)

def _plot_logloss_curves(cvresult, png_path):
    """Draw train/test mlogloss error bars on the current figure and save it.

    Args:
        cvresult: Table with the ``test-mlogloss-mean``, ``test-mlogloss-std``,
            ``train-mlogloss-mean`` and ``train-mlogloss-std`` columns, one
            row per boosting round.
        png_path: File name the figure is saved to.
    """
    x_axis = range(cvresult.shape[0])
    pyplot.errorbar(x_axis, cvresult['test-mlogloss-mean'],
                    yerr=cvresult['test-mlogloss-std'], label='Test')
    pyplot.errorbar(x_axis, cvresult['train-mlogloss-mean'],
                    yerr=cvresult['train-mlogloss-std'], label='Train')
    pyplot.title("XGBoost n_estimators vs Log Loss")
    pyplot.xlabel('n_estimators')
    pyplot.ylabel('Log Loss')
    pyplot.savefig(png_path)


def modelfit(alg, X_train, y_train, useTrainCV=True, cv_folds=None,
             early_stopping_rounds=50, num_class=9):
    """Pick ``n_estimators`` by cross-validation, then fit ``alg``.

    When ``useTrainCV`` is true, ``xgb.cv`` is run with early stopping on
    mlogloss. The number of rows in its result is set as ``alg``'s
    ``n_estimators``; the table is printed and written to
    ``my_preds_4_1.csv`` and the curves are saved to ``n_estimators.png``.
    In all cases ``alg`` is then fitted on the training data and its
    training log loss is printed.

    Args:
        alg: XGBoost scikit-learn style estimator; it is modified in place.
        X_train: Training features.
        y_train: Training labels.
        useTrainCV: Whether to run the cross-validation step first.
        cv_folds: Passed to ``xgb.cv`` as ``folds``.
        early_stopping_rounds: Passed to ``xgb.cv``.
        num_class: Number of target classes handed to ``xgb.cv``; defaults
            to 9, the value that used to be hard-coded here.
    """
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgb_param['num_class'] = num_class

        xgtrain = xgb.DMatrix(X_train, label=y_train)
        cvresult = xgb.cv(
            xgb_param,
            xgtrain,
            num_boost_round=alg.get_params()['n_estimators'],
            folds=cv_folds,
            metrics='mlogloss',
            early_stopping_rounds=early_stopping_rounds,
        )

        # The CV table has one row per boosting round that was kept.
        n_estimators = cvresult.shape[0]
        alg.set_params(n_estimators=n_estimators)

        # xgb.cv returns a DataFrame by default, so it can be printed and
        # saved directly.
        print(cvresult)
        cvresult.to_csv('my_preds_4_1.csv', index_label='n_estimators')

        _plot_logloss_curves(cvresult, 'n_estimators.png')

    # Fit the algorithm on the data. eval_metric is not passed to fit():
    # without an eval_set it has no effect on the fitted model, and recent
    # XGBoost releases no longer accept it as a fit() argument.
    alg.fit(X_train, y_train)

    # Predict on the training set and report its log loss.
    train_predprob = alg.predict_proba(X_train)
    logloss = log_loss(y_train, train_predprob)

    print("logloss of train :")
    print(logloss)


xgb1 = XGBClassifier(
    learning_rate=0.1,
    n_estimators=1000,  # a large value is fine: cv returns a suitable n_estimators
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.3,
    colsample_bytree=0.8,
    colsample_bylevel=0.7,
    objective='multi:softprob',
    seed=3,
)

modelfit(xgb1, X_train, y_train, cv_folds=kfold)

# DataFrame.from_csv has been removed from pandas; read_csv with index_col=0
# reads the saved table back with n_estimators as the index.
cvresult = pd.read_csv('my_preds_4_1.csv', index_col=0)

_plot_logloss_curves(cvresult, 'n_estimators4_1.png')
pyplot.show()

在这里插入图片描述

# Detail view of the saved CV curves, starting at a later boosting round.
# NOTE(review): presumably skips the early rounds to zoom in on the tail - confirm.
first_round = 100

# DataFrame.from_csv has been removed from pandas; read_csv with index_col=0
# reads the saved table back with n_estimators as the index.
cvresult = pd.read_csv('my_preds_4_1.csv', index_col=0)
cvresult = cvresult.iloc[first_round:]

# plot
test_means = cvresult['test-mlogloss-mean']
test_stds = cvresult['test-mlogloss-std']

train_means = cvresult['train-mlogloss-mean']
train_stds = cvresult['train-mlogloss-std']

# Shift the x axis by the same offset used for the slice above.
x_axis = range(first_round, cvresult.shape[0] + first_round)

fig = pyplot.figure(figsize=(10, 10), dpi=100)
pyplot.errorbar(x_axis, test_means, yerr=test_stds, label='Test')
pyplot.errorbar(x_axis, train_means, yerr=train_stds, label='Train')
pyplot.title("XGBoost n_estimators vs Log Loss")
pyplot.xlabel('n_estimators')
pyplot.ylabel('Log Loss')
pyplot.savefig('n_estimators_detail.png')

pyplot.show()

在这里插入图片描述



【本文地址】


今日新闻


推荐新闻


CopyRight 2018-2019 办公设备维修网 版权所有 豫ICP备15022753号-3