# coding=utf-8
"""
加载原始数据,并将categorical features转化成NN模型1的one-hot编码,将numerical features标准化
@author: yuhaitao
"""
import pandas as pd
import os
import tqdm
import numpy as np
import seaborn as sns
import json
import datetime
import multiprocessing
from sklearn.model_selection import KFold
def min_max_norm(x, feature_infos):
    """Min-max normalize a numerical (deep) feature column using the
    precomputed bounds stored in feature_infos."""
    min_value = feature_infos[x.name]['min']
    max_value = feature_infos[x.name]['max']
    out = pd.Series(index=range(x.size), dtype=float)
    for index, one in enumerate(x):
        if one == max_value:
            out[index] = 1.0
        elif one == min_value:
            out[index] = 0.0
        else:
            out[index] = (one - min_value) / (max_value - min_value)
    return out
def var_norm(x, feature_infos):
    """Z-score normalize a feature column: (x - mean) / std, using the
    statistics precomputed in feature_infos."""
    mean = feature_infos[x.name]['mean']
    std = feature_infos[x.name]['std']
    out = pd.Series(index=range(x.size), dtype=float)
    for index, one in enumerate(x):
        out[index] = (one - mean) / std
    return out
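# Illustrative note (not part of the pipeline above): since feature_infos already holds
# the per-column statistics, a vectorized equivalent of var_norm would be simply
# (x - feature_infos[x.name]['mean']) / feature_infos[x.name]['std'] applied to the whole
# Series; the explicit loops are kept to match the original element-wise implementation.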
class myDataLoader(object):
    """
    Loads the raw csv data and prepares datasets for the boosting and NN models.
    """
def __init__(self, data_path):
"""
初始化
"""
self.data_path = data_path
def dataset_for_boost(self, train_file, test_file, label_file):
"""
加载数据集
"""
train_data = pd.read_csv(os.path.join(self.data_path, train_file))
test_data = pd.read_csv(os.path.join(self.data_path, test_file))
train_answer = pd.read_csv(os.path.join(self.data_path, label_file))
train_data = train_data.merge(train_answer, on='id', how='left')
        # Handle missing values (this dataset currently has none)
        # Drop features whose values are all identical
singleValuesCnt = 0
for i in train_data.columns:
if len(train_data[i].unique()) == 1:
train_data.drop([i], axis=1, inplace=True)
test_data.drop([i], axis=1, inplace=True)
singleValuesCnt += 1
print("{}singleValues feathers are cleaned..".format(singleValuesCnt))
return train_data, test_data
def normalize_to_json(self, train_data):
"""
将训练集数据规范化后的均值,边界等指标存入json文件
"""
feature_infos = {}
        # Randomly sample 80% of the rows for the statistics, mimicking a 5-fold split
train_data = train_data.sample(frac=0.8, replace=False, axis=0)
        # Handle the deep (numerical) and wide (categorical) parts separately
use_cols = [col for col in train_data.columns if col !=
'id' and 'p' not in col]
print(f'Number of common used features: {len(use_cols)}')
deep_cols, wide_cols = [], []
for col in use_cols:
if train_data[col].dtype == float:
deep_cols.append(col)
else:
wide_cols.append(col)
        # Deep (numerical) part
        def numeric_status(x):
            return pd.Series([x.min(), x.mean(), x.max(), x.std()],
                             index=['min', 'mean', 'max', 'std'])
deep_norm_df = train_data[deep_cols].apply(numeric_status)
        for col in deep_cols:
            # Clip the effective min/max at mean +/- 3 * std to limit the influence of outliers
            c_max = min(deep_norm_df[col]['max'],
                        deep_norm_df[col]['mean'] + deep_norm_df[col]['std'] * 3)
            c_min = max(deep_norm_df[col]['min'],
                        deep_norm_df[col]['mean'] - deep_norm_df[col]['std'] * 3)
            feature_infos[col] = {'min': c_min, 'max': c_max,
                                  'mean': deep_norm_df[col]['mean'],
                                  'std': deep_norm_df[col]['std']}
        # Wide (categorical) part
        def categorical_status(x):
            # Count category frequencies and keep the (at most) 100 most frequent ones
            cat_dict = {}
            for one in x:
                if not pd.isnull(one):
                    if int(one) in cat_dict:
                        cat_dict[int(one)] += 1
                    else:
                        cat_dict[int(one)] = 1
            cat_list = [tup[0] for tup in sorted(
                cat_dict.items(), key=lambda kv: kv[1], reverse=True)[:min(len(cat_dict), 100)]]
# if 0 not in cat_list:
# cat_list.append(0)
cat_list.sort()
return pd.Series([cat_list], index=['list'])
wide_norm_df = train_data[wide_cols].apply(categorical_status)
for col in wide_cols:
            feature_infos[col] = {'list': wide_norm_df[col]['list']}
        # Labels (not standardized for now)
label_cols = ['p1', 'p2', 'p3', 'p4', 'p5', 'p6']
label_norm_df = train_data[label_cols].apply(numeric_status)
for col in label_cols:
            l_max = min(label_norm_df[col]['max'],
                        label_norm_df[col]['mean'] + label_norm_df[col]['std'] * 3)
            l_min = max(label_norm_df[col]['min'],
                        label_norm_df[col]['mean'] - label_norm_df[col]['std'] * 3)
            feature_infos[col] = {'min': l_min, 'max': l_max,
                                  'mean': label_norm_df[col]['mean'],
                                  'std': label_norm_df[col]['std']}
with open('./feature_info.json', 'w') as f:
f.write(json.dumps(feature_infos))
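    # Illustrative layout of feature_info.json as written above (the feature names
    # "f0" and "c0" are hypothetical; "p1" is one of the real label columns):
    #   {
    #     "f0": {"min": 0.1, "max": 0.9, "mean": 0.5, "std": 0.2},  # numerical (deep) feature
    #     "c0": {"list": [0, 1, 2]},                                # categorical (wide) feature
    #     "p1": {"min": 0.0, "max": 1.0, "mean": 0.4, "std": 0.3}   # label column
    #   }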
def prepare_nn_data(self, data, mode='train', norm_mode='min_max'):
"""
将数据规范化后存入csv文件
"""
with open('./feature_info.json', 'r') as f:
feature_infos = json.load(f)
nn_data_file = os.path.join(
self.data_path, f'nn_{mode}_{norm_mode}.csv')
use_cols, not_use_cols = [], []
for col in data.columns:
if col != 'id' and 'p' not in col:
use_cols.append(col)
else:
not_use_cols.append(col)
print(f'Number of common used features: {len(use_cols)}')
print('*' * 120)
        # Split the features into the wide and deep parts
deep_cols, wide_cols = [], []
for col in use_cols:
if data[col].dtype == float:
deep_cols.append(col)
else:
wide_cols.append(col)
        # One-hot encode the wide part; align() keeps the encoded dimensions
        # consistent with the columns stored in feature_info
start_time = datetime.datetime.now()
print(f'Number of wide features: {len(wide_cols)}')
one_hot_list = []
for col in wide_cols:
for c in feature_infos[col]['list']:
one_hot_list.append(f'w{col}_{c}')
print(f'one hot dimension: {len(one_hot_list)}')
wide_df = pd.DataFrame(columns=one_hot_list)
        # Generate the one-hot encoding for the current dataset
one_hot_df = pd.get_dummies(data[wide_cols].astype(
str), prefix=['w' + col for col in wide_cols])
print(f'current one hot dimension: {len(one_hot_df.columns)}')
        # Merge the two dataframes, keeping the column set defined by feature_info
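        # Illustrative example (hypothetical column names): if feature_info defines
        # ['wc1_0', 'wc1_1', 'wc1_2'] but the current data only produces
        # ['wc1_0', 'wc1_2'], the left join keeps all three template columns,
        # fills the missing 'wc1_1' with 0, and drops any column not in the template.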
_, wide_df = wide_df.align(one_hot_df, join='left', axis=1, fill_value=0)
wide_df = wide_df.astype(np.float32)
end_time = datetime.datetime.now()
print(f'data processing cost time: {(end_time-start_time)}')
print(f'wide part dimension: {wide_df.shape}')
print('*' * 120)
print(wide_df.columns)
        # Deep part
if norm_mode == 'min_max':
deep_df = data[deep_cols].apply(min_max_norm, args=(feature_infos,))
else:
deep_df = data[deep_cols].apply(var_norm, args=(feature_infos, ))
print(f'Number of deep features: {len(deep_cols)}')
print(f'deep part dimension: {deep_df.shape}')
print('*' * 120)
        # Save to file
out_df = pd.DataFrame()
out_df = out_df.join(wide_df, how='right')
out_df = out_df.join(deep_df, how='right')
out_df = out_df.join(data[not_use_cols], how='right')
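        # out_df column layout: the one-hot wide columns, then the normalized deep
        # columns, then the untouched columns (id and any p* label columns).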
out_df.to_csv(nn_data_file, index=False)
def data_for_nn(self, train_file, test_file):
"""
直接读取已经标准化与one hot编码好的csv文件
"""
train_data = pd.read_csv(os.path.join(self.data_path, train_file))
test_data = pd.read_csv(os.path.join(self.data_path, test_file))
return train_data, test_data
if __name__ == '__main__':
    # Load the data
data_loader = myDataLoader('./data/molecule_open_data')
train_data, test_data = data_loader.dataset_for_boost(
train_file='candidate_train.csv', test_file='candidate_test_clean.csv', label_file='train_answer.csv')
# data_loader.normalize_to_json(train_data)
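    # Note: prepare_nn_data() reads ./feature_info.json, so normalize_to_json()
    # must have been run at least once before the calls below.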
    data_loader.prepare_nn_data(train_data, 'train', 'var')  # training set
# data_loader.prepare_nn_data(test_data, 'test', 'var') # test A
data_loader.prepare_nn_data(test_data, 'new_test', 'var') # test B