Files
fun-rec/codes/base_models/utils.py
2021-12-04 10:58:42 +08:00

52 lines
2.6 KiB
Python

from collections import namedtuple
# Feature markers, declared as named tuples.
SparseFeat = namedtuple(
    'SparseFeat',
    ('name', 'vocabulary_size', 'embedding_dim'),
)
DenseFeat = namedtuple(
    'DenseFeat',
    ('name', 'dimension'),
)
VarLenSparseFeat = namedtuple(
    'VarLenSparseFeat',
    ('name', 'vocabulary_size', 'embedding_dim', 'maxlen'),
)
def get_mtl_data(train_path='./data/adult_train.csv',
                 test_path='./data/adult_test.csv',
                 *, embedding_dim=16):
    """Build the inputs for a two-task (multi-task learning) model.

    Reads two headerless census CSV files, concatenates them so that the
    encoders and the scaler are fitted on the combined rows, derives two
    binary labels, encodes the features, and finally splits the rows back
    into train/test by position.

    Derived labels:
      * ``label_income``: 1 for ``' >50K'`` / ``' >50K.'``, 0 for
        ``' <=50K'`` / ``' <=50K.'``; any other ``income_bracket`` value
        becomes NaN (``Series.map`` with a dict).
      * ``label_marital``: 1 when ``marital_status`` equals
        ``' Never-married'``, else 0.

    Args:
        train_path: Path of the training CSV. Defaults to the location the
            function originally hard-coded.
        test_path: Path of the test CSV. Defaults to the location the
            function originally hard-coded.
        embedding_dim: ``embedding_dim`` passed to every ``SparseFeat``
            (keyword-only). Defaults to 16.

    Returns:
        A 4-tuple ``(dnn_feature_columns, train_model_input,
        test_model_input, y_list)`` where

        * ``dnn_feature_columns`` is the list of DeepCTR ``SparseFeat``
          columns followed by the ``DenseFeat`` columns,
        * ``train_model_input`` / ``test_model_input`` map each feature name
          to the corresponding column of the train / test rows,
        * ``y_list`` is ``[label_income, label_marital]`` as arrays for the
          training rows only.
    """
    import pandas as pd
    from sklearn.preprocessing import MinMaxScaler, LabelEncoder
    # NOTE: inside this function these names shadow the module-level
    # SparseFeat / DenseFeat named tuples; the DeepCTR versions are used below.
    from deepctr.feature_column import SparseFeat, DenseFeat, get_feature_names

    CENSUS_COLUMNS = ['age', 'workclass', 'fnlwgt', 'education', 'education_num',
                      'marital_status', 'occupation', 'relationship', 'race',
                      'gender', 'capital_gain', 'capital_loss', 'hours_per_week',
                      'native_country', 'income_bracket']
    label_columns = ['label_income', 'label_marital']
    dense_features = ['fnlwgt', 'education_num', 'capital_gain', 'capital_loss',
                      'hours_per_week']

    df_train = pd.read_csv(train_path, header=None, names=CENSUS_COLUMNS)
    df_test = pd.read_csv(test_path, header=None, names=CENSUS_COLUMNS)
    data = pd.concat([df_train, df_test], axis=0)

    # Build the two labels. The string keys keep the leading space (and the
    # optional trailing period) exactly as they are matched against the data.
    data['label_income'] = data['income_bracket'].map(
        {' >50K.': 1, ' >50K': 1, ' <=50K.': 0, ' <=50K': 0})
    data['label_marital'] = data['marital_status'].apply(
        lambda x: 1 if x == ' Never-married' else 0)
    data.drop(labels=['marital_status', 'income_bracket'], axis=1, inplace=True)

    # Every remaining column that is neither dense nor a label is treated as
    # a sparse (categorical) feature.
    sparse_features = [col for col in data.columns.values.tolist()
                       if col not in dense_features and col not in label_columns]

    data[sparse_features] = data[sparse_features].fillna('-1')
    data[dense_features] = data[dense_features].fillna(0)

    # Scale dense features into [0, 1]; fitted on train and test rows together.
    mms = MinMaxScaler(feature_range=(0, 1))
    data[dense_features] = mms.fit_transform(data[dense_features])

    # Replace each sparse feature by integer codes 0..n_classes-1.
    for feat in sparse_features:
        lbe = LabelEncoder()
        data[feat] = lbe.fit_transform(data[feat])

    # vocabulary_size is max code + 1 because the codes start at 0.
    fixlen_feature_columns = (
        [SparseFeat(feat, data[feat].max() + 1, embedding_dim=embedding_dim)
         for feat in sparse_features]
        + [DenseFeat(feat, 1) for feat in dense_features]
    )
    dnn_feature_columns = fixlen_feature_columns
    feature_names = get_feature_names(dnn_feature_columns)

    # Split back by position: the first n_train rows came from the train file.
    n_train = df_train.shape[0]
    train = data.iloc[:n_train]
    test = data.iloc[n_train:]

    train_model_input = {name: train[name] for name in feature_names}
    test_model_input = {name: test[name] for name in feature_names}

    # Only the training labels are returned.
    y_list = [train['label_income'].values, train['label_marital'].values]
    return dnn_feature_columns, train_model_input, test_model_input, y_list