ウミガメバイオインフォ

bioinfo/IT

RandomizedSearchCV

Python sklearn Python-sklearn

概要

これまでグリッドサーチ(GridSearchCV)でのパラメータチューニングしか行ったことがなかったので、ランダムサーチ(RandomizedSearchCV)を試してみた。

環境

データの準備

import xgboost as xgb
from sklearn import datasets
from sklearn.model_selection import train_test_split, KFold, RandomizedSearchCV
import pandas as pd
import numpy as np
import inspect
from scipy.stats import pearsonr

# Load the Boston housing data through the imported `datasets` module
# (a bare `load_boston()` is not in scope and raises NameError).
# NOTE(review): `load_boston` was removed in scikit-learn 1.2; on newer
# versions another regression data set must be substituted.
data = datasets.load_boston()
# Hold out 20% of the samples as a test set; the seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(data['data'], data['target'], test_size=0.2, random_state=0)

# Without shuffling the folds are already deterministic, so no random_state is
# passed: scikit-learn >= 0.24 raises ValueError when random_state is set
# together with shuffle=False.
kf = KFold(n_splits=5, shuffle=False)

モデルの作成

# Base estimator for the search: an XGBoost regressor whose objective is set
# explicitly to squared error. `set_params` returns the estimator itself, so
# construction and configuration can be chained.
params = dict(objective='reg:squarederror')
model = xgb.XGBRegressor().set_params(**params)

クロスバリデーションの評価関数の定義

def cv_r_score(estimator, x, y):
    """Score a fitted estimator by the Pearson correlation of its predictions.

    The signature ``(estimator, x, y)`` matches what scikit-learn expects from
    a callable passed as ``scoring``.

    Args:
        estimator: Fitted object exposing ``predict(x)``.
        x: Feature matrix passed straight to ``estimator.predict``.
        y: True targets, either 1-D ``(n_samples,)`` or 2-D; for 2-D input
            only the first column is used.

    Returns:
        Pearson correlation coefficient between ``y`` and the predictions.
    """
    y_pred = estimator.predict(x)
    y = np.asarray(y)
    # Only a 2-D target needs its first column extracted. Applying `.T[0]`
    # to a 1-D target would collapse it to a single scalar and break pearsonr.
    if y.ndim > 1:
        y = y.T[0]
    r, _ = pearsonr(y, y_pred)
    return r

RandomizedSearchCV

# Candidate values for each tuned XGBoost hyperparameter. Because every entry
# is a list, RandomizedSearchCV samples combinations without replacement from
# the 3**6 = 729 possible settings.
params = {
    'max_depth': [5, 10, 15],
    'min_child_weight': [1, 3, 6],
    'max_delta_step': [0, 4, 8],
    'gamma': [0.0, 0.1, 0.2],
    'subsample': [0.5, 0.7, 0.9],
    'colsample_bytree': [0.5, 0.7, 0.9],
}
# Evaluate 10 randomly drawn settings with the predefined folds and the
# Pearson-r scorer. random_state is fixed, like the train/test split, so the
# same candidates are drawn on every run.
rmcv = RandomizedSearchCV(model, params, cv=kf, scoring=cv_r_score, n_iter=10, random_state=0)
rmcv.fit(x_train, y_train)

結果の出力

# Print the sampled value of every searched hyperparameter (one array per
# parameter, one element per candidate), followed by the mean CV score of
# each candidate.
results = rmcv.cv_results_
for key, values in results.items():
    # Per-parameter columns contain 'param'; the aggregate 'params' entry
    # (a list of dicts) is skipped.
    if key != 'params' and 'param' in key:
        # Drop everything up to and including the first underscore.
        label = key.partition('_')[2]
        print(f"{label}: {values}")
print(results['mean_test_score'])

subsample: [0.9 0.9 0.5 0.5 0.9 0.7 0.5 0.7 0.9 0.7]
min_child_weight: [3 3 3 1 6 6 6 3 3 1]
max_depth: [10 10 10 10 15 5 15 5 5 15]
max_delta_step: [8 4 0 8 0 0 4 0 0 4]
gamma: [0.0 0.1 0.1 0.0 0.1 0.0 0.0 0.0 0.0 0.2]
colsample_bytree: [0.7 0.5 0.5 0.5 0.5 0.7 0.5 0.7 0.9 0.5]
[0.89627005 0.89432932 0.89878338 0.88887008 0.89917085 0.90628873
 0.8967743  0.90508957 0.90810282 0.88271599]