ctr9.py (committed 2020-12-03 by pan: init)
# import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
import lightgbm as lgbm
import pickle
import time
from lightgbm.sklearn import LGBMClassifier
# from sklearn.model_selection import GridSearchCV
from scipy.sparse import csr_matrix
import warnings
warnings.filterwarnings("ignore")
begin_time = time.time()
def timer(s=0):
    global begin_time
    if s == 1:
        begin_time = time.time()
    else:
        print(time.time() - begin_time)
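# Illustrative usage of the timer helper above (it is not actually called in this script):
# timer(1)   # reset the clock
# ...        # work to be timed
# timer()    # print seconds elapsed since the reset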
X_train = pd.read_pickle("pkl/X_train")
# X_train = X_train.sample(n=2000000, random_state=0, axis=0)
print("X_train read")
X_train = csr_matrix(X_train)
print("CSR")
y_train = pd.read_pickle("pkl/y_train")
# y_train = y_train.sample(n=2000000, random_state=0, axis=0)
MAX_ROUNDS = 10000
kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=3)
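# Note: 'kfold' is defined here but not used again below. A minimal sketch of how it
# could produce an out-of-fold log-loss estimate (hypothetical, not executed; assumes
# y_train is a pandas Series and uses an arbitrary small model purely for illustration):
# from sklearn.metrics import log_loss
# for trn_idx, val_idx in kfold.split(X_train, y_train):
#     fold_clf = LGBMClassifier(n_estimators=200, learning_rate=0.1)
#     fold_clf.fit(X_train[trn_idx], y_train.iloc[trn_idx])
#     fold_proba = fold_clf.predict_proba(X_train[val_idx])[:, 1]
#     print(log_loss(y_train.iloc[val_idx], fold_proba))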
# Find a good number of boosting rounds via LightGBM's built-in CV with early stopping
def get_n_estimators(params, X_train, y_train, early_stopping_rounds=10):
    lgbm_params = params.copy()
    lgbmtrain = lgbm.Dataset(X_train, y_train)
    cv_result = lgbm.cv(lgbm_params, lgbmtrain, num_boost_round=MAX_ROUNDS, nfold=3, metrics='binary_logloss',
                        early_stopping_rounds=early_stopping_rounds, seed=3)
    print('best n_estimators:', len(cv_result['binary_logloss-mean']))
    return len(cv_result['binary_logloss-mean'])
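# Hypothetical example of how the helper above might be called during tuning
# (not executed here; the tuned values are hard-coded below):
# base_params = {'boosting_type': 'gbdt', 'objective': 'binary',
#                'learning_rate': 0.01, 'max_depth': 7, 'num_leaves': 63}
# best_rounds = get_n_estimators(base_params, X_train, y_train)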
# The following parameters were obtained by tuning across ctr3.py - ctr9.py
n_estimators_1 = 863
num_leaves = 63
min_child_samples = 20
subsample = 0.9
colsample_bytree = 0.3
n_estimators_2 = 9830
params = {'boosting_type': 'gbdt',
          'objective': 'binary',
          'n_jobs': -1,
          'learning_rate': 0.01,
          'n_estimators': n_estimators_2,
          'max_depth': 7,
          'num_leaves': num_leaves,
          'min_child_samples': min_child_samples,
          'max_bin': 127,  # 2^7 - 1; the raw features are integers and rarely exceed 100
          'subsample': subsample,
          'bagging_freq': 1,
          'colsample_bytree': colsample_bytree,
          'verbose': -1
          }
lg = LGBMClassifier(silent=False, **params)
# Train the GBDT model
lg.fit(X_train, y_train)
# Save the trained model
pickle.dump(lg, open("pkl/model_1", 'wb'))
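# The pickled model can later be restored for inference, for example:
# with open("pkl/model_1", 'rb') as f:
#     lg = pickle.load(f)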
X_test = pd.read_pickle("pkl/X_test")
# Predict click probabilities on the test set (probabilities, not hard class labels,
# are what the log-loss-style score reported below is computed on)
y_test_pred = lg.predict_proba(X_test)[:, 1]
print(len(y_test_pred))
# Merge the predictions with the test IDs
test_id = pickle.load(open("pkl/test_id.pkl", 'rb'))
out_df = pd.DataFrame(columns=['click'], data=y_test_pred)
out_df = pd.concat([test_id, out_df], axis=1)
# Write the CSV for submission to Kaggle; final Private Score: 0.43939
out_df.to_csv("data/pred_1", index=False)