# Source file: fun-rec/codes/base_models/PlotModels.py
# Snapshot: 2021-12-04 10:58:42 +08:00 (262 lines, 12 KiB, Python)
# Note: the hosting page flagged this file as containing ambiguous Unicode
# characters, i.e. characters that might be confused with other characters.
from collections import namedtuple
from tensorflow import keras
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from DeepCrossing import DeepCrossing
from DeepFM import DeepFM
from NFM import NFM
from WideNDeep import WideNDeep
from DIN import DIN
from NCF import NCF
from AFM import AFM
from DCN import DCN
from PNN import PNN
from DIEN import DIEN
from utils import DenseFeat, SparseFeat, VarLenSparseFeat
def data_process(data_df, dense_features, sparse_features):
    """Apply simple preprocessing to the dense and sparse columns of a frame.

    Dense columns: missing values become 0.0, then every value greater than
    -1 is replaced by ``log(value + 1)`` and every other value by -1.
    Sparse columns: missing values become the string "-1", then each column
    is integer-encoded with its own freshly fitted ``LabelEncoder``.

    Args:
        data_df: DataFrame holding the columns; it is modified in place.
        dense_features: list of numeric column names.
        sparse_features: list of categorical column names.

    Returns:
        The sub-frame of ``data_df`` restricted to the dense columns followed
        by the sparse columns.
    """
    def shifted_log(value):
        # Values at or below -1 have no real log(value + 1); map them to -1
        if value > -1:
            return np.log(value + 1)
        return -1

    # Numeric features: fill gaps first so the transform never sees NaN
    data_df[dense_features] = data_df[dense_features].fillna(0.0)
    for column in dense_features:
        data_df[column] = data_df[column].apply(shifted_log)

    # Categorical features: missing values get their own "-1" category
    data_df[sparse_features] = data_df[sparse_features].fillna("-1")
    for column in sparse_features:
        data_df[column] = LabelEncoder().fit_transform(data_df[column])

    selected = dense_features + sparse_features
    return data_df[selected]
def read_criteo_data(path='./data/criteo_sample.txt'):
    """Load a Criteo-style CSV and split its columns into dense and sparse.

    Args:
        path: location of the CSV file. Defaults to the sample file this
            script has always read, so existing zero-argument calls behave
            exactly as before.

    Returns:
        A tuple ``(data, dense_features, sparse_features)`` where ``data`` is
        the loaded DataFrame, ``dense_features`` lists the column names that
        contain the letter ``'I'`` and ``sparse_features`` lists the column
        names that contain the letter ``'C'``.
    """
    data = pd.read_csv(path)

    # Column roles are decided by a case-sensitive substring test on the
    # name, so a name containing both letters lands in both lists.
    columns = data.columns.values
    dense_features = [feat for feat in columns if 'I' in feat]
    sparse_features = [feat for feat in columns if 'C' in feat]
    return data, dense_features, sparse_features
def plot_deepcrossing():
    """Build DeepCrossing on a few Criteo columns and save its diagram.

    Takes the first three dense and the first three sparse columns returned
    by ``read_criteo_data`` and writes the plot to
    ``./imgs/DeepCrossing.png``.
    """
    data, all_dense, all_sparse = read_criteo_data()
    dense_subset = all_dense[:3]
    sparse_subset = all_sparse[:3]

    # Describe every selected column: sparse descriptors first, then dense
    sparse_columns = [
        SparseFeat(name, vocabulary_size=data[name].nunique(), embedding_dim=4)
        for name in sparse_subset
    ]
    dense_columns = [DenseFeat(name, 1) for name in dense_subset]
    dnn_feature_columns = sparse_columns + dense_columns

    # Construct the DeepCrossing model and render it with layer shapes
    model = DeepCrossing(dnn_feature_columns)
    keras.utils.plot_model(model, to_file="./imgs/DeepCrossing.png", show_shapes=True)
def plot_deepfm():
    """Build DeepFM on a few Criteo columns and save its diagram.

    Uses the first three dense and first two sparse columns returned by
    ``read_criteo_data``; the linear part and the DNN part receive separately
    built but identical feature descriptions. The plot is written to
    ``./imgs/DeepFM.png``.
    """
    data, dense_features, sparse_features = read_criteo_data()
    dense_features, sparse_features = dense_features[:3], sparse_features[:2]

    def describe_columns():
        # Sparse descriptors first, then dense ones, as a brand-new list
        described = []
        for name in sparse_features:
            described.append(
                SparseFeat(name, vocabulary_size=data[name].nunique(), embedding_dim=4)
            )
        for name in dense_features:
            described.append(DenseFeat(name, 1))
        return described

    # Each model part gets its own list (choose the grouping per use case)
    linear_feature_columns = describe_columns()
    dnn_feature_columns = describe_columns()

    # Construct the DeepFM model and render it with layer shapes
    model = DeepFM(linear_feature_columns, dnn_feature_columns)
    keras.utils.plot_model(model, to_file="./imgs/DeepFM.png", show_shapes=True)
def plot_nfm():
    """Build NFM on a few Criteo columns and save its diagram.

    Uses the first three dense and first two sparse columns returned by
    ``read_criteo_data`` and writes the plot to ``./imgs/NFM.png``.
    """
    # Load the data and keep only a handful of columns
    data, dense_features, sparse_features = read_criteo_data()
    dense_features = dense_features[:3]
    sparse_features = sparse_features[:2]

    # Describe the columns for the linear part and the DNN part (choose the
    # grouping per use case); the enumerate index was never used, so the
    # lists are built by iterating over the names directly.
    linear_feature_columns = [
        SparseFeat(feat, vocabulary_size=data[feat].nunique(), embedding_dim=4)
        for feat in sparse_features
    ] + [DenseFeat(feat, 1) for feat in dense_features]
    dnn_feature_columns = [
        SparseFeat(feat, vocabulary_size=data[feat].nunique(), embedding_dim=4)
        for feat in sparse_features
    ] + [DenseFeat(feat, 1) for feat in dense_features]

    # Construct the NFM model and render it with layer shapes
    model = NFM(linear_feature_columns, dnn_feature_columns)
    keras.utils.plot_model(model, to_file="./imgs/NFM.png", show_shapes=True)
def plot_widendeep():
    """Build Wide&Deep on a few Criteo columns and save its diagram.

    Uses the first three dense and first two sparse columns returned by
    ``read_criteo_data`` and writes the plot to ``./imgs/Wide&Deep.png``.
    """
    # Load the data and keep only a handful of columns
    data, dense_features, sparse_features = read_criteo_data()
    dense_features = dense_features[:3]
    sparse_features = sparse_features[:2]

    # Describe the columns for the linear (wide) part and the DNN (deep)
    # part; the enumerate index was never used, so iterate names directly.
    linear_feature_columns = [
        SparseFeat(feat, vocabulary_size=data[feat].nunique(), embedding_dim=4)
        for feat in sparse_features
    ] + [DenseFeat(feat, 1) for feat in dense_features]
    dnn_feature_columns = [
        SparseFeat(feat, vocabulary_size=data[feat].nunique(), embedding_dim=4)
        for feat in sparse_features
    ] + [DenseFeat(feat, 1) for feat in dense_features]

    # Construct the WideNDeep model and render it with layer shapes
    model = WideNDeep(linear_feature_columns, dnn_feature_columns)
    keras.utils.plot_model(model, to_file="./imgs/Wide&Deep.png", show_shapes=True)
def plot_din():
    """Build DIN on the movie sample data and save its diagram.

    Reads the tab-separated, header-less ``./data/movie_sample.txt``, names
    its eight columns, describes the features and writes the plot to
    ``./imgs/DIN.png``.
    """
    samples_data = pd.read_csv("./data/movie_sample.txt", sep="\t", header=None)
    samples_data.columns = [
        "user_id", "gender", "age", "hist_movie_id",
        "hist_len", "movie_id", "movie_type_id", "label",
    ]

    def vocab_size(column):
        # One more than the largest value seen in the column
        return max(samples_data[column]) + 1

    # Categorical descriptors in their original order, then the dense length
    categorical = ('user_id', 'gender', 'age', 'movie_id', 'movie_type_id')
    feature_columns = [
        SparseFeat(column, vocab_size(column), embedding_dim=8)
        for column in categorical
    ]
    feature_columns.append(DenseFeat('hist_len', 1))

    # The history sequence is sized from the movie_id column
    feature_columns.append(
        VarLenSparseFeat('hist_movie_id', vocabulary_size=vocab_size("movie_id"),
                         embedding_dim=8, maxlen=50)
    )

    # Base behaviour feature and the sequence feature that records its history
    behavior_feature_list = ['movie_id']
    behavior_seq_feature_list = ['hist_movie_id']

    model = DIN(feature_columns, behavior_feature_list, behavior_seq_feature_list)
    keras.utils.plot_model(model, to_file="./imgs/DIN.png", show_shapes=True)
def plot_pnn():
    """Build PNN on a few Criteo columns and save its diagram.

    Takes the first three dense and the first three sparse columns returned
    by ``read_criteo_data`` and writes the plot to ``./imgs/PNN.png``.
    """
    data, dense_features, sparse_features = read_criteo_data()
    dense_features, sparse_features = dense_features[:3], sparse_features[:3]

    # Describe every selected column: sparse descriptors first, then dense
    dnn_feature_columns = []
    for name in sparse_features:
        dnn_feature_columns.append(
            SparseFeat(name, vocabulary_size=data[name].nunique(), embedding_dim=4)
        )
    for name in dense_features:
        dnn_feature_columns.append(DenseFeat(name, 1))

    # Construct the PNN model and render it with layer shapes
    model = PNN(dnn_feature_columns)
    keras.utils.plot_model(model, to_file="./imgs/PNN.png", show_shapes=True)
def plot_ncf():
    """Build NCF on the MovieLens-1M ratings and save its diagram.

    Only ``user_id`` and ``movie_id`` are used as features. Both columns are
    label-encoded before their number of distinct values is taken as the
    vocabulary size. The plot is written to ``./imgs/NCF.png``.
    """
    id_columns = ('user_id', 'movie_id')

    # The ratings file is "::"-separated and has no header row
    ratings = pd.read_csv(
        './data/ml-1m/ratings.dat',
        sep='::',
        engine='python',
        names=['user_id', 'movie_id', 'rating', 'timestamp'],
    )

    # One encoder object, refitted for each id column in turn
    encoder = LabelEncoder()
    for column in id_columns:
        ratings[column] = encoder.fit_transform(ratings[column])

    dnn_feature_columns = [
        SparseFeat(column, ratings[column].nunique(), 8) for column in id_columns
    ]

    # Construct the NCF model and render it with layer shapes
    model = NCF(dnn_feature_columns)
    keras.utils.plot_model(model, to_file="./imgs/NCF.png", show_shapes=True)
def plot_dcn():
    """Build DCN on a few Criteo columns and save its diagram.

    Uses the first three dense and first two sparse columns returned by
    ``read_criteo_data`` and writes the plot to ``./imgs/DCN.png``.
    """
    # Load the data and keep only a handful of columns
    data, dense_features, sparse_features = read_criteo_data()
    dense_features = dense_features[:3]
    sparse_features = sparse_features[:2]

    # Describe the columns for the linear part and the DNN part (choose the
    # grouping per use case); the enumerate index was never used, so the
    # lists are built by iterating over the names directly.
    linear_feature_columns = [
        SparseFeat(feat, vocabulary_size=data[feat].nunique(), embedding_dim=4)
        for feat in sparse_features
    ] + [DenseFeat(feat, 1) for feat in dense_features]
    dnn_feature_columns = [
        SparseFeat(feat, vocabulary_size=data[feat].nunique(), embedding_dim=4)
        for feat in sparse_features
    ] + [DenseFeat(feat, 1) for feat in dense_features]

    # Construct the DCN model (the old comment here named AFM by mistake)
    model = DCN(linear_feature_columns, dnn_feature_columns)
    keras.utils.plot_model(model, to_file="./imgs/DCN.png", show_shapes=True)
def plot_afm():
    """Build AFM on a few Criteo columns and save its diagram.

    Uses the first three dense and first two sparse columns returned by
    ``read_criteo_data`` and writes the plot to ``./imgs/AFM.png``.
    """
    # Load the data and keep only a handful of columns
    data, dense_features, sparse_features = read_criteo_data()
    dense_features = dense_features[:3]
    sparse_features = sparse_features[:2]

    # Describe the columns for the linear part and the DNN part (choose the
    # grouping per use case); the enumerate index was never used, so the
    # lists are built by iterating over the names directly.
    linear_feature_columns = [
        SparseFeat(feat, vocabulary_size=data[feat].nunique(), embedding_dim=4)
        for feat in sparse_features
    ] + [DenseFeat(feat, 1) for feat in dense_features]
    dnn_feature_columns = [
        SparseFeat(feat, vocabulary_size=data[feat].nunique(), embedding_dim=4)
        for feat in sparse_features
    ] + [DenseFeat(feat, 1) for feat in dense_features]

    # Construct the AFM model and render it with layer shapes
    model = AFM(linear_feature_columns, dnn_feature_columns)
    keras.utils.plot_model(model, to_file="./imgs/AFM.png", show_shapes=True)
def plot_dien():
    """Build DIEN (with negative sampling) on the movie sample data and plot it.

    Reads the tab-separated, header-less ``data/movie_sample.txt``, names its
    eight columns, describes the features and writes the plot to
    ``./imgs/DIEN.png``. No training data is prepared here: the unused
    ``X``/``y`` frames of the earlier version were dropped because only the
    model structure is needed for the diagram.
    """
    # Load the samples
    samples_data = pd.read_csv("data/movie_sample.txt", sep="\t", header=None)
    samples_data.columns = ["user_id", "gender", "age", "hist_movie_id", "hist_len", "movie_id", "movie_type_id", "label"]

    # Both history sequences share the movie_id vocabulary, so compute its
    # size (largest id + 1) once instead of three times.
    movie_vocab_size = max(samples_data["movie_id"]) + 1

    # Feature descriptions
    feature_columns = [SparseFeat('user_id', max(samples_data["user_id"]) + 1, embedding_dim=8),
                       SparseFeat('gender', max(samples_data["gender"]) + 1, embedding_dim=8),
                       SparseFeat('age', max(samples_data["age"]) + 1, embedding_dim=8),
                       SparseFeat('movie_id', movie_vocab_size, embedding_dim=8),
                       SparseFeat('movie_type_id', max(samples_data["movie_type_id"]) + 1, embedding_dim=8),
                       DenseFeat('hist_len', 1)]
    feature_columns += [VarLenSparseFeat('hist_movie_id', vocabulary_size=movie_vocab_size, embedding_dim=8, maxlen=50)]
    feature_columns += [VarLenSparseFeat('neg_hist_movie_id', vocabulary_size=movie_vocab_size, embedding_dim=8, maxlen=50)]

    # Base behaviour feature
    behavior_feature_list = ['movie_id']
    # Sequence feature recording the behaviour history
    behavior_seq_feature_list = ['hist_movie_id']
    # Negatively sampled sequence feature
    neg_seq_feature_list = ['neg_hist_movie_id']

    # Construct the DIEN model (the old note here named DIN by mistake)
    model = DIEN(feature_columns, behavior_feature_list, behavior_seq_feature_list, neg_seq_feature_list, use_neg_sample=True)
    keras.utils.plot_model(model, to_file="./imgs/DIEN.png", show_shapes=True)
if __name__ == "__main__":
    # Only the DIEN diagram is rendered when the script is run directly.
    # To render another model, call its function here instead:
    # plot_deepcrossing, plot_deepfm, plot_nfm, plot_widendeep, plot_din,
    # plot_ncf, plot_afm, plot_dcn or plot_pnn.
    plot_dien()