# File listing metadata: 262 lines, 12 KiB, Python.
from collections import namedtuple
|
||
from tensorflow import keras
|
||
import pandas as pd
|
||
import numpy as np
|
||
from sklearn.preprocessing import LabelEncoder
|
||
|
||
from DeepCrossing import DeepCrossing
|
||
from DeepFM import DeepFM
|
||
from NFM import NFM
|
||
from WideNDeep import WideNDeep
|
||
from DIN import DIN
|
||
from NCF import NCF
|
||
from AFM import AFM
|
||
from DCN import DCN
|
||
from PNN import PNN
|
||
from DIEN import DIEN
|
||
|
||
from utils import DenseFeat, SparseFeat, VarLenSparseFeat
|
||
|
||
def data_process(data_df, dense_features, sparse_features):
    """Apply basic preprocessing to the dense and sparse feature columns.

    Dense columns: missing values are filled with ``0.0``; afterwards every
    value ``x > -1`` becomes ``log(x + 1)`` and every other value becomes
    ``-1``.
    Sparse columns: missing values are filled with the string ``"-1"`` and
    each column is then label-encoded with its own ``LabelEncoder``.

    ``data_df`` is modified in place by the column assignments below.

    Args:
        data_df: DataFrame that contains the listed columns.
        dense_features: names of the numeric columns (joined to
            ``sparse_features`` with ``+``, e.g. a list).
        sparse_features: names of the categorical columns.

    Returns:
        ``data_df`` indexed by ``dense_features + sparse_features``.
    """
    def _shifted_log(value):
        # log(x + 1) is only taken for x > -1; anything else is flagged as -1.
        if value > -1:
            return np.log(value + 1)
        return -1

    data_df[dense_features] = data_df[dense_features].fillna(0.0)
    for column in dense_features:
        data_df[column] = data_df[column].apply(_shifted_log)

    data_df[sparse_features] = data_df[sparse_features].fillna("-1")
    for column in sparse_features:
        encoder = LabelEncoder()
        data_df[column] = encoder.fit_transform(data_df[column])

    selected_columns = dense_features + sparse_features
    return data_df[selected_columns]
|
||
|
||
|
||
def read_criteo_data(path='./data/criteo_sample.txt'):
    """Load a Criteo-style CSV and split its columns into dense and sparse.

    Args:
        path: CSV file to read. Defaults to the sample path this function
            always used, so existing zero-argument calls are unaffected.

    Returns:
        A ``(data, dense_features, sparse_features)`` tuple: the loaded
        DataFrame, the list of column names containing ``'I'`` and the list
        of column names containing ``'C'``.
    """
    data = pd.read_csv(path)

    # Columns are classified by substring match on their name, so a name
    # containing both letters ends up in both lists.
    column_names = list(data.columns)
    dense_features = [name for name in column_names if 'I' in name]
    sparse_features = [name for name in column_names if 'C' in name]

    return data, dense_features, sparse_features
|
||
|
||
|
||
def plot_deepcrossing():
    """Build a DeepCrossing model on a few Criteo columns and save its diagram.

    Uses the first three dense and first three sparse columns returned by
    ``read_criteo_data`` and writes the plot to ``./imgs/DeepCrossing.png``.
    """
    data, dense_features, sparse_features = read_criteo_data()
    dense_subset = dense_features[:3]
    sparse_subset = sparse_features[:3]

    # Tag every column with its feature type; sparse columns come first.
    sparse_columns = [
        SparseFeat(name, vocabulary_size=data[name].nunique(), embedding_dim=4)
        for name in sparse_subset
    ]
    dense_columns = [DenseFeat(name, 1) for name in dense_subset]
    dnn_feature_columns = sparse_columns + dense_columns

    # Build the DeepCrossing model and render its structure.
    model = DeepCrossing(dnn_feature_columns)
    keras.utils.plot_model(model, to_file="./imgs/DeepCrossing.png", show_shapes=True)
|
||
|
||
|
||
def plot_deepfm():
    """Build a DeepFM model on a few Criteo columns and save its diagram.

    Uses the first three dense and first two sparse columns returned by
    ``read_criteo_data``. The linear part and the DNN part receive separately
    built, identically configured column lists. The plot is written to
    ``./imgs/DeepFM.png``.
    """
    data, dense_features, sparse_features = read_criteo_data()
    dense_subset = dense_features[:3]
    sparse_subset = sparse_features[:2]

    def tagged_columns():
        # Fresh SparseFeat/DenseFeat markers on every call, sparse first.
        columns = []
        for name in sparse_subset:
            columns.append(
                SparseFeat(name, vocabulary_size=data[name].nunique(), embedding_dim=4)
            )
        for name in dense_subset:
            columns.append(DenseFeat(name, 1))
        return columns

    # Which features go to the linear part and which to the DNN part is a
    # per-scenario choice; here both parts get the same selection.
    linear_feature_columns = tagged_columns()
    dnn_feature_columns = tagged_columns()

    # Build the DeepFM model and render its structure.
    model = DeepFM(linear_feature_columns, dnn_feature_columns)
    keras.utils.plot_model(model, to_file="./imgs/DeepFM.png", show_shapes=True)
|
||
|
||
|
||
def plot_nfm():
    """Build an NFM model on a few Criteo columns and save its diagram.

    Uses the first three dense and first two sparse columns returned by
    ``read_criteo_data``; the plot is written to ``./imgs/NFM.png``.
    """
    data, dense_features, sparse_features = read_criteo_data()
    dense_subset = dense_features[:3]
    sparse_subset = sparse_features[:2]

    def make_sparse(name):
        return SparseFeat(name, vocabulary_size=data[name].nunique(), embedding_dim=4)

    def make_dense(name):
        return DenseFeat(name, 1)

    # The linear and DNN parts each get their own list of feature markers,
    # sparse columns first, built from the same column selection.
    linear_feature_columns = (
        [make_sparse(name) for name in sparse_subset]
        + [make_dense(name) for name in dense_subset]
    )
    dnn_feature_columns = (
        [make_sparse(name) for name in sparse_subset]
        + [make_dense(name) for name in dense_subset]
    )

    # Build the NFM model and render its structure.
    model = NFM(linear_feature_columns, dnn_feature_columns)
    keras.utils.plot_model(model, to_file="./imgs/NFM.png", show_shapes=True)
|
||
|
||
|
||
def plot_widendeep():
    """Build a Wide&Deep model on a few Criteo columns and save its diagram.

    Uses the first three dense and first two sparse columns returned by
    ``read_criteo_data``; the plot is written to ``./imgs/Wide&Deep.png``.
    """
    data, dense_features, sparse_features = read_criteo_data()
    dense_subset = dense_features[:3]
    sparse_subset = sparse_features[:2]

    # Build the column markers twice so the linear (wide) and DNN (deep)
    # parts each own their list; sparse columns precede dense ones.
    column_groups = []
    for _ in range(2):
        group = [
            SparseFeat(name, vocabulary_size=data[name].nunique(), embedding_dim=4)
            for name in sparse_subset
        ]
        group.extend(DenseFeat(name, 1) for name in dense_subset)
        column_groups.append(group)
    linear_feature_columns, dnn_feature_columns = column_groups

    # Build the WideNDeep model and render its structure.
    model = WideNDeep(linear_feature_columns, dnn_feature_columns)
    keras.utils.plot_model(model, to_file="./imgs/Wide&Deep.png", show_shapes=True)
|
||
|
||
|
||
def plot_din():
    """Build a DIN model from the movie sample file and save its diagram.

    Reads ``./data/movie_sample.txt`` (tab-separated, no header row), derives
    each vocabulary size as ``max(column) + 1`` and writes the plot to
    ``./imgs/DIN.png``.
    """
    samples_data = pd.read_csv("./data/movie_sample.txt", sep="\t", header=None)
    samples_data.columns = [
        "user_id", "gender", "age", "hist_movie_id",
        "hist_len", "movie_id", "movie_type_id", "label",
    ]

    # One embedding-backed marker per categorical column.
    categorical_names = ('user_id', 'gender', 'age', 'movie_id', 'movie_type_id')
    feature_columns = [
        SparseFeat(name, max(samples_data[name]) + 1, embedding_dim=8)
        for name in categorical_names
    ]
    feature_columns.append(DenseFeat('hist_len', 1))

    # The history sequence takes its vocabulary size from the movie_id column.
    movie_vocabulary = max(samples_data["movie_id"]) + 1
    feature_columns.append(
        VarLenSparseFeat('hist_movie_id', vocabulary_size=movie_vocabulary,
                         embedding_dim=8, maxlen=50)
    )

    # Base behaviour feature and its matching behaviour-sequence feature.
    behavior_feature_list = ['movie_id']
    behavior_seq_feature_list = ['hist_movie_id']

    # Build the DIN model and render its structure.
    model = DIN(feature_columns, behavior_feature_list, behavior_seq_feature_list)
    keras.utils.plot_model(model, to_file="./imgs/DIN.png", show_shapes=True)
|
||
|
||
|
||
def plot_pnn():
    """Build a PNN model on a few Criteo columns and save its diagram.

    Uses the first three dense and first three sparse columns returned by
    ``read_criteo_data``; the plot is written to ``./imgs/PNN.png``.
    """
    data, dense_features, sparse_features = read_criteo_data()
    dense_subset = dense_features[:3]
    sparse_subset = sparse_features[:3]

    # Tag every column with its feature type; sparse columns come first.
    dnn_feature_columns = []
    for name in sparse_subset:
        dnn_feature_columns.append(
            SparseFeat(name, vocabulary_size=data[name].nunique(), embedding_dim=4)
        )
    for name in dense_subset:
        dnn_feature_columns.append(DenseFeat(name, 1))

    # Build the PNN model and render its structure.
    model = PNN(dnn_feature_columns)
    keras.utils.plot_model(model, to_file="./imgs/PNN.png", show_shapes=True)
|
||
|
||
|
||
def plot_ncf():
    """Build an NCF model from the MovieLens ratings file and save its diagram.

    Reads ``./data/ml-1m/ratings.dat`` (``::``-separated), label-encodes the
    user and movie ids, and writes the plot to ``./imgs/NCF.png``. Only the
    ``user_id`` and ``movie_id`` columns are turned into model features.
    """
    rnames = ['user_id', 'movie_id', 'rating', 'timestamp']
    data = pd.read_csv('./data/ml-1m/ratings.dat', sep='::', engine='python', names=rnames)

    id_columns = ('user_id', 'movie_id')

    # A single encoder object is reused; fit_transform refits it per column.
    encoder = LabelEncoder()
    for column in id_columns:
        data[column] = encoder.fit_transform(data[column])

    dnn_feature_columns = [
        SparseFeat(column, data[column].nunique(), 8) for column in id_columns
    ]

    # Build the NCF model and render its structure.
    model = NCF(dnn_feature_columns)
    keras.utils.plot_model(model, to_file="./imgs/NCF.png", show_shapes=True)
|
||
|
||
|
||
def plot_dcn():
    """Build a DCN model on a few Criteo columns and save its diagram.

    Uses the first three dense and first two sparse columns returned by
    ``read_criteo_data``; the plot is written to ``./imgs/DCN.png``.
    """
    data, dense_features, sparse_features = read_criteo_data()
    dense_subset = dense_features[:3]
    sparse_subset = sparse_features[:2]

    def build_columns():
        # Sparse markers first, then dense markers; new objects on each call.
        sparse_part = [
            SparseFeat(name, vocabulary_size=data[name].nunique(), embedding_dim=4)
            for name in sparse_subset
        ]
        dense_part = [DenseFeat(name, 1) for name in dense_subset]
        return sparse_part + dense_part

    # Both model parts are fed the same column selection.
    linear_feature_columns = build_columns()
    dnn_feature_columns = build_columns()

    # Build the DCN model and render its structure.
    model = DCN(linear_feature_columns, dnn_feature_columns)
    keras.utils.plot_model(model, to_file="./imgs/DCN.png", show_shapes=True)
|
||
|
||
|
||
def plot_afm():
    """Build an AFM model on a few Criteo columns and save its diagram.

    Uses the first three dense and first two sparse columns returned by
    ``read_criteo_data``; the plot is written to ``./imgs/AFM.png``.
    """
    data, dense_features, sparse_features = read_criteo_data()
    dense_subset = dense_features[:3]
    sparse_subset = sparse_features[:2]

    # Linear part: sparse markers followed by dense markers.
    linear_feature_columns = []
    for name in sparse_subset:
        linear_feature_columns.append(
            SparseFeat(name, vocabulary_size=data[name].nunique(), embedding_dim=4)
        )
    for name in dense_subset:
        linear_feature_columns.append(DenseFeat(name, 1))

    # DNN part: same selection, built again as separate marker objects.
    dnn_feature_columns = []
    for name in sparse_subset:
        dnn_feature_columns.append(
            SparseFeat(name, vocabulary_size=data[name].nunique(), embedding_dim=4)
        )
    for name in dense_subset:
        dnn_feature_columns.append(DenseFeat(name, 1))

    # Build the AFM model and render its structure.
    model = AFM(linear_feature_columns, dnn_feature_columns)
    keras.utils.plot_model(model, to_file="./imgs/AFM.png", show_shapes=True)
|
||
|
||
|
||
def plot_dien():
    """Build a DIEN model from the movie sample file and save its diagram.

    Reads ``data/movie_sample.txt`` (tab-separated, no header row), derives
    each vocabulary size as ``max(column) + 1`` and writes the plot to
    ``./imgs/DIEN.png``. The model is built with ``use_neg_sample=True``, so
    a negative-sample history feature is declared alongside the real one.
    """
    # Load the samples and name the columns.
    samples_data = pd.read_csv("data/movie_sample.txt", sep="\t", header=None)
    samples_data.columns = [
        "user_id", "gender", "age", "hist_movie_id",
        "hist_len", "movie_id", "movie_type_id", "label",
    ]

    # Feature markers: categorical columns, the history length, then the
    # variable-length history sequences.
    feature_columns = [
        SparseFeat('user_id', max(samples_data["user_id"]) + 1, embedding_dim=8),
        SparseFeat('gender', max(samples_data["gender"]) + 1, embedding_dim=8),
        SparseFeat('age', max(samples_data["age"]) + 1, embedding_dim=8),
        SparseFeat('movie_id', max(samples_data["movie_id"]) + 1, embedding_dim=8),
        SparseFeat('movie_type_id', max(samples_data["movie_type_id"]) + 1, embedding_dim=8),
        DenseFeat('hist_len', 1),
    ]

    # Both sequence features take their vocabulary size from movie_id.
    movie_vocabulary = max(samples_data["movie_id"]) + 1
    feature_columns += [
        VarLenSparseFeat('hist_movie_id', vocabulary_size=movie_vocabulary,
                         embedding_dim=8, maxlen=50),
    ]
    feature_columns += [
        VarLenSparseFeat('neg_hist_movie_id', vocabulary_size=movie_vocabulary,
                         embedding_dim=8, maxlen=50),
    ]

    # Base behaviour feature.
    behavior_feature_list = ['movie_id']
    # Behaviour-sequence feature.
    behavior_seq_feature_list = ['hist_movie_id']
    # Negative-sample sequence feature.
    neg_seq_feature_list = ['neg_hist_movie_id']

    # Build the DIEN model and render its structure.
    model = DIEN(feature_columns, behavior_feature_list, behavior_seq_feature_list,
                 neg_seq_feature_list, use_neg_sample=True)

    keras.utils.plot_model(model, to_file="./imgs/DIEN.png", show_shapes=True)
|
||
|
||
|
||
if __name__ == '__main__':
    # Script entry point: exactly one diagram is generated per run.
    # Uncomment the call for the model whose structure should be plotted.
    # plot_deepcrossing()
    # plot_deepfm()
    # plot_nfm()
    # plot_widendeep()
    # plot_din()
    # plot_ncf()
    # plot_afm()
    # plot_dcn()
    # plot_pnn()
    plot_dien()
|
||
|