master

分支 (1)

管理

管理

master

avazu-ctr-prediction
/
ctr9.py

# import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
import lightgbm as lgbm
import _pickle as pickle
import time
from lightgbm.sklearn import LGBMClassifier
# from sklearn.model_selection import GridSearchCV
from scipy.sparse import csr_matrix

import warnings

# Silence every warning for the rest of the run. Note that this is a blanket
# filter, so deprecation notices from the imported libraries are hidden too.
warnings.filterwarnings("ignore")

# Wall-clock reference point shared with timer(); initialised at import time.
begin_time = time.time()


def timer(s=0):
    """Tiny stopwatch built on the module-level ``begin_time``.

    Args:
        s: Pass ``1`` to restart the stopwatch. Any other value (default
            ``0``) prints the seconds elapsed since the last restart.

    Returns:
        None. The elapsed time is printed, not returned.
    """
    global begin_time
    now = time.time()
    if s != 1:
        # Report mode: show how long it has been since the reference point.
        print(now - begin_time)
        return
    # Reset mode: move the reference point to "now".
    begin_time = now


# Load the pickled training features from disk.
X_train = pd.read_pickle("pkl/X_train")
# Optional down-sampling for quick experiments (disabled):
# X_train = X_train.sample(n=2000000, random_state=0, axis=0)
print("X_train read")
# Convert the features to a SciPy CSR sparse matrix before training.
X_train = csr_matrix(X_train)
print("CSR")
# Load the pickled training targets.
y_train = pd.read_pickle("pkl/y_train")
# Matching down-sampling for the targets (disabled; same n/random_state as above):
# y_train = y_train.sample(n=2000000, random_state=0, axis=0)

# Upper bound on boosting rounds passed to lgbm.cv in get_n_estimators.
MAX_ROUNDS = 10000
# Seeded, shuffled 3-fold stratified splitter.
# NOTE(review): not referenced anywhere in the visible code - confirm whether
# it is still needed (get_n_estimators uses nfold=3 instead).
kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=3)


def get_n_estimators(params, X_train, y_train, early_stopping_rounds=10):
    """Pick the number of boosting rounds with 3-fold LightGBM cross-validation.

    Runs ``lgbm.cv`` with the ``binary_logloss`` metric for at most
    ``MAX_ROUNDS`` rounds and early stopping, then reports how many rounds
    were recorded in the CV history.

    Args:
        params: LightGBM parameter dict. A shallow copy is handed to
            ``lgbm.cv``, so the caller's dict is not rebound.
        X_train: Training features accepted by ``lgbm.Dataset``.
        y_train: Training targets accepted by ``lgbm.Dataset``.
        early_stopping_rounds: Forwarded unchanged to ``lgbm.cv``.

    Returns:
        The length of the ``'binary_logloss-mean'`` history, which is also
        printed.
    """
    dataset = lgbm.Dataset(X_train, y_train)
    history = lgbm.cv(
        params.copy(),
        dataset,
        num_boost_round=MAX_ROUNDS,
        nfold=3,
        metrics='binary_logloss',
        early_stopping_rounds=early_stopping_rounds,
        seed=3,
    )
    best_rounds = len(history['binary_logloss-mean'])
    print('best n_estimators:', best_rounds)
    return best_rounds

# The values below are the result of the tuning done in ctr3.py - ctr9.py.
n_estimators_1 = 863
num_leaves = 63
min_child_samples = 20
subsample = 0.9
colsample_bytree = 0.3
n_estimators_2 = 9830

# Final hyper-parameters handed to LGBMClassifier.
params = {'boosting_type': 'gbdt',
          'objective': 'binary',
          'n_jobs': -1,
          'learning_rate': 0.01,
          'n_estimators': n_estimators_2,
          'max_depth': 7,
          'num_leaves': num_leaves,
          'min_child_samples': min_child_samples,
          # Original note: "2^6" (127 is actually 2^7 - 1); the raw features
          # are integers that rarely exceed 100.
          'max_bin': 127,
          'subsample': subsample,
          'bagging_freq': 1,
          'colsample_bytree': colsample_bytree,
          # Key was previously misspelled as 'verbose=', which LightGBM does
          # not recognise, so the verbosity setting was never applied.
          'verbose': -1
          }

# Train the GBDT model on the full training set with the tuned parameters.
lg = LGBMClassifier(silent=False, **params)
lg.fit(X_train, y_train)

# Save the fitted model. The context manager guarantees the file is flushed
# and closed even if pickling raises.
with open("pkl/model_1", 'wb') as model_file:
    pickle.dump(lg, model_file)

X_test = pd.read_pickle("pkl/X_test")
# Predict on the test set. The submission is scored on log loss, so export the
# probability of the positive class (column 1 of predict_proba) rather than
# the hard 0/1 labels returned by predict().
y_test_pred = lg.predict_proba(X_test)[:, 1]
print(len(y_test_pred))

# Join the predictions with the test ids.
# NOTE: unpickling can execute arbitrary code - only load trusted files here.
with open("pkl/test_id.pkl", 'rb') as id_file:
    test_id = pickle.load(id_file)
# Reuse test_id's index so the column-wise concat pairs rows by position;
# concat(axis=1) aligns on index and would otherwise introduce NaNs whenever
# test_id does not carry a default 0..n-1 index.
out_df = pd.DataFrame(columns=['click'], data=y_test_pred, index=test_id.index)
out_df = pd.concat([test_id, out_df], axis=1)
# Write the csv for submission to Kaggle (recorded Private Score: 0.43939).
out_df.to_csv("data/pred_1", index=False)