# Files
# fun-rec/codes/WDL.py
# 2021-03-29 19:37:37 +08:00
#
# 201 lines, 7.9 KiB, Python
# (Raw / Blame / History — scraped page header preserved as comments)
#
# NOTE: the original page warned that this file contains ambiguous Unicode
# characters that might be confused with other characters.
import pandas as pd
import numpy as np
import warnings
import random, math, os
from tqdm import tqdm
from tensorflow.keras import *
from tensorflow.keras.layers import *
from tensorflow.keras.models import *
from tensorflow.keras.callbacks import *
import tensorflow.keras.backend as K
from tensorflow.keras.regularizers import l2, l1_l2
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler, LabelEncoder
# 读取数据,并将标签做简单的转换
def get_data():
    """Load the adult census train/test CSVs and attach a binary label.

    Returns:
        (df_train, df_test): DataFrames with an extra `income_label`
        column — 1 when `income_bracket` contains ">50K", else 0.
    """
    cols = ["age", "workclass", "fnlwgt", "education", "education_num",
            "marital_status", "occupation", "relationship", "race", "gender",
            "capital_gain", "capital_loss", "hours_per_week", "native_country",
            "income_bracket"]
    frames = []
    for path in ("./data/adult_train.csv", "./data/adult_test.csv"):
        df = pd.read_csv(path, names=cols)
        # Substring match ("in") also covers test labels with a trailing '.'.
        df['income_label'] = (df["income_bracket"].apply(lambda x: ">50K" in x)).astype(int)
        frames.append(df)
    return frames[0], frames[1]
# 特征处理分为wide部分的特征处理和deep部分的特征处理
def data_process(df_train, df_test):
    """Declare the feature layout for the wide and deep branches.

    Adds a discretized `age_group` column to both frames and returns the
    column lists each branch consumes.

    Returns:
        (df_train, df_test, wide_cols, x_cols, embedding_cols,
         cont_cols, target)
    """
    # Bucketize age into three coarse groups: (0,25], (25,65], (65,90].
    bins = [0, 25, 65, 90]
    bin_labels = range(len(bins) - 1)
    for frame in (df_train, df_test):
        frame['age_group'] = pd.cut(frame['age'], bins, labels=bin_labels)
    # Wide (linear) part: raw categorical features ...
    wide_cols = ['workclass', 'education', 'marital_status', 'occupation',
                 'relationship', 'race', 'gender', 'native_country', 'age_group']
    # ... plus these crossed feature pairs.
    x_cols = (['education', 'occupation'], ['native_country', 'occupation'])
    # Deep part: categorical features need an embedding before entering
    # the network ...
    embedding_cols = ['workclass', 'education', 'marital_status', 'occupation',
                      'relationship', 'race', 'gender', 'native_country']
    # ... while numeric features can be fed in directly.
    cont_cols = ['age', 'capital_gain', 'capital_loss', 'hours_per_week']
    # Label column name.
    target = 'income_label'
    return df_train, df_test, wide_cols, x_cols, embedding_cols, cont_cols, target
def process_wide_feats(df_train, df_test, wide_cols, x_cols, target):
    """One-hot encode the wide features (raw categoricals + crossed pairs).

    Args:
        df_train, df_test: frames produced by data_process.
        wide_cols: categorical feature names for the wide branch.
        x_cols: iterable of [f1, f2] pairs to cross.
        target: label column name.

    Returns:
        X_train, y_train, X_test, y_test as numpy arrays (labels are
        column vectors of shape (n, 1)).
    """
    # Tag and concatenate so train/test share a single one-hot vocabulary.
    df_train['IS_TRAIN'] = 1
    df_test['IS_TRAIN'] = 0
    df_wide = pd.concat([df_train, df_test])
    # Categorical features are stored as object dtype in pandas.
    categorical_columns = list(df_wide.select_dtypes(include=['object']).columns)
    # Build crossed features, e.g. education_occupation = "Bachelors-Sales".
    crossed_columns_d = []
    for f1, f2 in x_cols:
        col_name = f1 + '_' + f2
        crossed_columns_d.append(col_name)
        df_wide[col_name] = df_wide[[f1, f2]].apply(lambda x: '-'.join(x), axis=1)
    # BUG FIX: build a new list instead of `wide_cols += ...`, which mutated
    # the caller's list in place.
    all_wide_cols = wide_cols + crossed_columns_d
    df_wide = df_wide[all_wide_cols + [target] + ['IS_TRAIN']]
    # One-hot encode every categorical or crossed wide feature.
    dummy_cols = [c for c in all_wide_cols if c in categorical_columns + crossed_columns_d]
    df_wide = pd.get_dummies(df_wide, columns=dummy_cols)
    # Split back into train/test and separate features from the label.
    train = df_wide[df_wide.IS_TRAIN == 1].drop('IS_TRAIN', axis=1)
    test = df_wide[df_wide.IS_TRAIN == 0].drop('IS_TRAIN', axis=1)
    feat_cols = [c for c in train.columns if c != target]
    X_train = train[feat_cols].values
    y_train = train[target].values.reshape(-1, 1)
    X_test = test[feat_cols].values
    y_test = test[target].values.reshape(-1, 1)
    return X_train, y_train, X_test, y_test
def process_deep_feats(df_train, df_test, embedding_cols, cont_cols, target, emb_dim=8, emb_reg=1e-3):
    """Prepare features, input layers and embedding layers for the deep branch.

    Args:
        df_train, df_test: frames produced by data_process.
        embedding_cols: categorical features that go through an Embedding.
        cont_cols: numeric features fed in directly (standardized here).
        target: label column name.
        emb_dim: embedding dimension per categorical feature.
        emb_reg: L2 regularization strength on embedding weights.

    Returns:
        X_train, y_train, X_test, y_test, emb_layer, inp_layer — where
        emb_layer holds the per-feature tensors to concatenate and
        inp_layer the matching Keras Input layers.
    """
    # Tag and concatenate so encoder/scaler see one consistent vocabulary.
    df_train['IS_TRAIN'] = 1
    df_test['IS_TRAIN'] = 0
    df_deep = pd.concat([df_train, df_test])
    # Deep branch consumes both categorical (embedded) and numeric features.
    deep_cols = embedding_cols + cont_cols
    df_deep = df_deep[deep_cols + [target, 'IS_TRAIN']]
    # Standardize numeric features.
    # BUG FIX: the original fit/transformed df_train only and assigned it back
    # through a fresh RangeIndex, leaving test rows misaligned or NaN; fit on
    # the combined frame and assign the raw array positionally instead.
    scaler = StandardScaler()
    df_deep[cont_cols] = scaler.fit_transform(df_deep[cont_cols])
    # Label-encode categorical features and record each vocabulary size.
    unique_vals = dict()
    lbe = LabelEncoder()
    for feats in embedding_cols:
        df_deep[feats] = lbe.fit_transform(df_deep[feats])
        unique_vals[feats] = df_deep[feats].nunique()
    # One Input per feature. Categoricals go through an Embedding; continuous
    # features through a Reshape so every branch emits a (batch, 1, k) tensor.
    inp_layer = []
    emb_layer = []
    for ec in embedding_cols:
        inp = Input(shape=(1,), dtype='int64', name=ec + '_inp')
        emb = Embedding(unique_vals[ec], emb_dim, input_length=1,
                        embeddings_regularizer=l2(emb_reg))(inp)
        inp_layer.append(inp)
        # BUG FIX: the original appended `inp` here, bypassing the Embedding.
        emb_layer.append(emb)
    for cc in cont_cols:
        # BUG FIX: continuous inputs were declared int64, which would truncate
        # the standardized float values; use float32.
        inp = Input(shape=(1,), dtype='float32', name=cc + '_inp')
        emb = Reshape((1, 1))(inp)
        inp_layer.append(inp)
        # BUG FIX: likewise append the reshaped tensor, not the raw Input.
        emb_layer.append(emb)
    # Split back into train/test.
    train = df_deep[df_deep.IS_TRAIN == 1].drop('IS_TRAIN', axis=1)
    test = df_deep[df_deep.IS_TRAIN == 0].drop('IS_TRAIN', axis=1)
    # Keras expects one array-like per Input, in deep_cols order.
    X_train = [train[c] for c in deep_cols]
    y_train = np.array(train[target].values).reshape(-1, 1)
    X_test = [test[c] for c in deep_cols]
    y_test = np.array(test[target].values).reshape(-1, 1)
    return X_train, y_train, X_test, y_test, emb_layer, inp_layer
def wide_deep(df_train, df_test, wide_cols, x_cols, embedding_cols, cont_cols):
    """Build, train and evaluate the Wide & Deep model.

    NOTE(review): `target` is read from module scope (bound in __main__),
    not passed as a parameter — confirm before reusing this function
    outside this script.

    Side effects: trains for 5 epochs and prints the test-set evaluation.
    """
    # Wide branch features (one-hot raw + crossed categoricals).
    X_train_wide, y_train_wide, X_test_wide, y_test_wide = \
        process_wide_feats(df_train, df_test, wide_cols, x_cols, target)
    # Deep branch features (embeddings + standardized numerics).
    X_train_deep, y_train_deep, X_test_deep, y_test_deep, deep_inp_embed, deep_inp_layer = \
        process_deep_feats(df_train, df_test, embedding_cols, cont_cols, target)
    # Wide and deep branches share the same labels.
    X_tr_wd = [X_train_wide] + X_train_deep
    Y_tr_wd = y_train_deep
    X_te_wd = [X_test_wide] + X_test_deep
    Y_te_wd = y_test_deep
    # Wide input: one dense one-hot vector per example.
    w = Input(shape=(X_train_wide.shape[1],), dtype='float32', name='wide')
    # Deep tower: concat per-feature tensors -> MLP with dropout + L1/L2 reg.
    d = concatenate(deep_inp_embed)
    d = Flatten()(d)
    d = Dense(50, activation='relu', kernel_regularizer=l1_l2(l1=0.01, l2=0.01))(d)
    d = Dropout(0.5)(d)
    d = Dense(20, activation='relu', name='deep')(d)
    d = Dropout(0.5)(d)
    # Joint head: concatenate wide and deep, then a sigmoid output layer.
    wd_inp = concatenate([w, d])
    wd_out = Dense(Y_tr_wd.shape[1], activation='sigmoid', name='wide_deep')(wd_inp)
    # FIX: don't shadow this function's own name with the model variable.
    model = Model(inputs=[w] + deep_inp_layer, outputs=wd_out)
    model.compile(optimizer='Adam', loss='binary_crossentropy', metrics=['AUC'])
    # FIX: plain attribute assignment (`model.optimizer.lr = 0.001`) rebinds
    # the attribute without updating the optimizer's learning-rate variable;
    # set it through the backend instead.
    K.set_value(model.optimizer.lr, 0.001)
    # Train, then evaluate on the held-out test set.
    model.fit(X_tr_wd, Y_tr_wd, epochs=5, batch_size=128)
    results = model.evaluate(X_te_wd, Y_te_wd)
    print("\n", results)
if __name__ == '__main__':
    # Load the raw train/test data.
    df_train, df_test = get_data()
    # Derive the feature layout for both branches.
    # NOTE: `target` is bound at module level here and read as a global
    # inside wide_deep(), so these names must not be renamed.
    df_train, df_test, wide_cols, x_cols, embedding_cols, cont_cols, target = data_process(df_train, df_test)
    # Train and evaluate the Wide & Deep model.
    wide_deep(df_train, df_test, wide_cols, x_cols, embedding_cols, cont_cols)