Merge pull request #48 from Lyons-T/master

Add files via upload
This commit is contained in:
若如意
2022-02-27 09:53:38 +08:00
committed by GitHub
40 changed files with 1495 additions and 0 deletions

View File

@@ -0,0 +1,53 @@
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# @File : create_ctr_data.py
# @Author: xLyons
# @IDE : PyCharm
# @Time : 2022/2/7
import pickle
import numpy as np
import pandas as pd
from sklearn.utils import shuffle
from model_tools.feature_columns import SparseFeat, DenseFeat
def create_ctr_data(data_path, args, use_dict=True):
with open(data_path + 'data.pkl', 'rb') as f:
all_data, feature_info = pickle.load(f)
f.close()
# 训练数据和测试数据
all_data = shuffle(all_data)
train_df = all_data[all_data['是否点击'] != -1]
test_df = all_data[all_data['是否点击'] == -1]
# 测试数据的标签
test_labels = pd.read_pickle(data_path + 'test_label.pkl')
test_labels = pd.merge(test_df[['index']], test_labels, how='left', on=['index'])
all_features = feature_info['dense_features'] + feature_info['sparse_features']
if use_dict:
train_inputs = {name: np.array(train_df[name].tolist()) for name in all_features}
train_labels = train_df['是否点击'].values
test_inputs = {name: np.array(test_df[name].tolist()) for name in all_features}
test_labels = test_labels['是否点击'].values
else:
train_inputs = [np.array(train_df[name]) for name in all_features]
train_labels = train_df['是否点击'].values
test_inputs = [np.array(test_df[name]) for name in all_features]
test_labels = test_labels['是否点击'].values
features_columns = [DenseFeat(name=feat,
dimension=1,
dtype='float32',)
for feat in feature_info['dense_features']]
features_columns += [SparseFeat(name=feat,
embed_name=feat,
embed_dim=args.embed_dim,
vocab_size=all_data[feat].max()+1,
dtype='int32',)
for feat in feature_info['sparse_features']]
return (train_inputs, train_labels), (test_inputs, test_labels), features_columns

View File

@@ -0,0 +1,163 @@
#!/usrbin/env python
# -*- coding:utf-8 -*-
# @File : news_data_process.py
# @Author: xLyons
# @IDE : PyCharm
# @Time : 2022/2/7
import os
import gc
import swifter
import pickle
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from sklearn.preprocessing import LabelEncoder, StandardScaler
from utils.data_compression import reduce_mem
def get_statistical_features(all_data, past_day=7):
# 统计新闻从发文到展示的日期差
temp = all_data['展现日期'] - all_data['发文日期']
all_data['从发文到展现的日期差'] = temp.dt.days
all_data.loc[all_data['从发文到展现的日期差'] < 0, '从发文到展现的日期差'] = 0
all_data.fillna(value={'从发文到展现的日期差': 0}, inplace=True)
statis_dense_columns = ['从发文到展现的日期差']
dates = all_data['展现日期'].unique()
dates.sort()
date_num = len(dates)
date_map = dict(zip(dates, range(date_num)))
all_data['展现日期_idx'] = all_data['展现日期'].map(date_map)
train_data = all_data[all_data['是否点击'] != -1]
# ===================================================================================
for feat in tqdm([['user_id'], ['item_id'], ['一级分类'], ['二级分类'],
['user_id', '一级分类'], ['user_id', '二级分类']]):
res_arr = []
name = f'过去{past_day}天_特征({"_".join(feat)})_展现总数'
statis_dense_columns.append(name)
for day in range(0, date_num):
train_data_temp = train_data[
(train_data['展现日期_idx'] >= day-past_day) & (train_data['展现日期_idx'] < day)]
train_data_temp = train_data_temp.groupby(feat)['item_id'].agg([
(name, 'count')]).reset_index()
train_data_temp['展现日期_idx'] = day
res_arr.append(train_data_temp)
stat_all_data = pd.concat(res_arr)
all_data = all_data.merge(stat_all_data, how='left', on=feat + ['展现日期_idx'])
target = '是否点击'
for feat in tqdm([['user_id'], ['item_id'], ['一级分类'], ['二级分类'],
['user_id', '一级分类'], ['user_id', '二级分类']]):
res_arr = []
name_mean = f'过去{past_day}天_特征({"_".join(feat)})_点击率mean'
name_sum = f'过去{past_day}天_特征({"_".join(feat)})_点击总数sum'
statis_dense_columns.append(name_mean)
statis_dense_columns.append(name_sum)
for day in range(0, date_num):
train_data_temp = train_data[
(train_data['展现日期_idx'] >= day-past_day) & (train_data['展现日期_idx'] < day)]
train_data_temp = train_data_temp.groupby(feat)[target].agg(
[(name_mean, 'mean'), (name_sum, 'sum')]).reset_index()
train_data_temp['展现日期_idx'] = day
res_arr.append(train_data_temp)
stat_all_data = pd.concat(res_arr)
all_data = all_data.merge(stat_all_data, how='left', on=feat + ['展现日期_idx'])
target = '消费时长(秒)'
for feat in tqdm([['user_id'], ['item_id'], ['一级分类'], ['二级分类'],
['user_id', '一级分类'], ['user_id', '二级分类']]):
res_arr = []
name_mean = f'过去{past_day}天_特征({"_".join(feat)})_消费时长mean'
name_std = f'过去{past_day}天_特征({"_".join(feat)})_消费时长std'
name_sum = f'过去{past_day}天_特征({"_".join(feat)})_消费时长sum'
statis_dense_columns.append(name_mean)
statis_dense_columns.append(name_std)
statis_dense_columns.append(name_sum)
for day in range(0, date_num):
train_data_temp = train_data[
(train_data['展现日期_idx'] >= day-past_day) & (train_data['展现日期_idx'] < day)]
train_data_temp = train_data_temp.groupby(feat)[target].agg(
[(name_mean, 'mean'), (name_std, 'std'), (name_sum, 'sum')]
).reset_index()
train_data_temp['展现日期_idx'] = day
res_arr.append(train_data_temp)
stat_all_data = pd.concat(res_arr)
all_data = all_data.merge(stat_all_data, how='left', on=feat + ['展现日期_idx'])
return all_data, statis_dense_columns
def main():
raw_data_path = '../raw_data'
new_data_path = '../new_data'
os.makedirs(new_data_path, exist_ok=True)
train_data_path = os.path.join(raw_data_path, 'train_data.pkl')
test_data_path = os.path.join(raw_data_path, 'test_data.pkl')
train_data = pd.read_pickle(train_data_path)
test_data = pd.read_pickle(test_data_path)
test_data['是否点击'] = -1
all_data = pd.concat([train_data, test_data])
# 1. 合并用户特征
user_path = os.path.join(new_data_path, 'user_info_5w.pkl')
user_info = pd.read_pickle(user_path)
all_data = all_data.merge(
user_info[['user_id', '设备名称', '操作系统', '所在省', '所在市', '年龄', '性别']],
how='left', on='user_id'
)
del user_info
gc.collect()
# 2. 合并文档特征
doc_path = os.path.join(new_data_path, 'doc_info.pkl')
doc_info = pd.read_pickle(doc_path)
all_data = all_data.merge(
doc_info[['item_id', '一级分类', '二级分类', '关键词', '图片数量', '发文时间', '发文日期']],
how='left', on='item_id'
)
del doc_info
gc.collect()
# 3. 获取统计特征
all_data, statis_dense_columns = get_statistical_features(all_data)
# 4. 连续特征处理
base_dense_columns = ['刷新次数', '图片数量']
dense_columns = base_dense_columns + statis_dense_columns
all_data.fillna(value={feat: 0 for feat in dense_columns}, inplace=True)
# sc = StandardScaler()
# all_data[dense_columns] = sc.fit_transform(all_data[dense_columns])
for feat in dense_columns:
all_data[feat] = np.log(1 + all_data[feat])
# 5. 离散特征处理
sparse_columns = ['user_id', 'item_id', '网路环境', '设备名称', '操作系统', '展现位置',
'所在省', '所在市', '年龄', '性别', '一级分类', '二级分类', '关键词']
for feat in sparse_columns:
lb = LabelEncoder()
all_data[feat] = lb.fit_transform(all_data[feat].astype(str))
all_data = reduce_mem(all_data)
feature_info = {'dense_features': dense_columns,
'sparse_features': sparse_columns}
file = [all_data, feature_info]
file_save_path = os.path.join(new_data_path, 'data.pkl')
with open(file_save_path, 'wb') as f:
pickle.dump(file, f)
f.close()
if __name__ == '__main__':
main()

View File

@@ -0,0 +1,46 @@
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# @File : train&test_data_split.py
# @Author: xLyons
# @IDE : PyCharm
# @Time : 2022/2/16
import os
import pandas as pd
def main():
raw_data_path = '../raw_data'
new_data_path = '../new_data'
# 1. 数据读取
all_data_path = os.path.join(raw_data_path, 'train_data_5w.csv')
all_data = pd.read_csv(all_data_path, sep='\t', index_col=0) # .sample(n=100000)
all_data.columns = ['user_id', 'item_id', '展现时间', '网路环境', '刷新次数', '展现位置', '是否点击', '消费时长(秒)']
print(f'样本总数为:{all_data.shape[0]}')
# 2. 数据处理
all_data.loc[all_data['消费时长(秒)'] < 0, '消费时长(秒)'] = 0
all_data['展现时间'] = pd.to_datetime(
all_data.loc[:, '展现时间'], utc=True, unit='ms').dt.tz_convert('Asia/Shanghai')
all_data['展现日期'] = all_data['展现时间'].dt.date
all_data['index'] = range(all_data.shape[0])
dates = all_data['展现日期'].unique()
dates.sort()
# 3. 训练、测试数据集划分
train_data = all_data[all_data['展现日期'] != dates[-1]]
test_data = all_data[all_data['展现日期'] == dates[-1]]
test_label = test_data[['index', '是否点击']]
# 4. 测试集处理
test_data = test_data.drop(columns=['消费时长(秒)', '展现位置', '是否点击'])
train_data.to_pickle(os.path.join(raw_data_path, 'train_data.pkl'))
test_data.to_pickle(os.path.join(raw_data_path, 'test_data.pkl'))
test_label.to_pickle(os.path.join(new_data_path, 'test_label.pkl'))
if __name__ == '__main__':
main()

View File

@@ -0,0 +1,103 @@
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# @File : user&doc_data_process.py
# @Author: xLyons
# @IDE : PyCharm
# @Time : 2022/2/9
import os
import swifter
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
def prob2val(feat_info):
# 判断是否为空
if feat_info == feat_info:
prob_list = [values.split(':') for values in feat_info.split(',')]
prob_list = sorted(prob_list, key=lambda x: float(x[1]))
return prob_list[-1][0]
else:
return np.NaN
def get_second_title(x):
if x['二级分类'] == x['二级分类']:
second_titles = x['二级分类'].split('/')
for title in second_titles:
# 跳过异常数据
if title == 'A_0_24:0.447656,A_25_29:0.243809,A_30_39:0.076268,A_40+:0.232267':
continue
# 优先返回不等于一级分类的二级分类
if title != x['一级分类']:
return title
return x['一级分类']
def get_key_word(feat_info):
if feat_info == feat_info and isinstance(feat_info, str):
key_word_list = [values.split(':') for values in feat_info.replace('^', '').split(',')]
new_list = []
last_elem = ''
for idx, values in enumerate(key_word_list):
if len(values) == 1:
last_elem = values[0] if last_elem == '' else ','.join([last_elem, values[0]])
continue
if len(values) > 2:
# 将类似于‘你好,李焕英’这种关键词重新进行拼接
# 这类关键词由于存在逗号在获取key_word_list时被误分开了
values[0] = ':'.join(values[:-1])
values[0] = values[0] if last_elem == '' else ','.join([last_elem, values[0]])
new_list.append(values)
last_elem = ''
return new_list[-1][0]
else:
return np.NaN
def main():
raw_data_path = '../raw_data'
new_data_path = '../new_data'
os.makedirs(new_data_path, exist_ok=True)
# 1. 处理用户文件
user_path = os.path.join(raw_data_path, 'user_info_5w.csv')
user_info = pd.read_csv(user_path, sep='\t', index_col=0)
user_info.columns = ['user_id', '设备名称', '操作系统', '所在省', '所在市', '年龄', '性别']
user_info['年龄'] = [prob2val(age_info) for age_info in tqdm(user_info['年龄'])]
user_info['性别'] = [prob2val(sex_info) for sex_info in tqdm(user_info['性别'])]
user_info.to_pickle(os.path.join(new_data_path, 'user_info_5w.pkl'))
# 2. 处理文档文件
doc_path = os.path.join(raw_data_path, 'doc_info.txt')
doc_info = pd.read_table(doc_path, sep='\t', low_memory=False, header=None)
doc_info.columns = ['item_id', '标题', '发文时间', '图片数量', '一级分类', '二级分类', '关键词']
# 处理异常的发文时间数据
condition_row = (doc_info['发文时间'].isnull()) | (doc_info['发文时间'] == 'Android')
time_fill_value = doc_info.loc[~condition_row, '发文时间'].swifter.apply(lambda x: int(x[:10])).astype('int').min()
doc_info.loc[condition_row, '发文时间'] = str(time_fill_value)
doc_info['发文时间'] = pd.to_datetime(
doc_info.loc[:, '发文时间'], utc=True, unit='ms').dt.tz_convert('Asia/Shanghai')
doc_info['发文日期'] = doc_info['发文时间'].dt.date
doc_info['图片数量'] = doc_info.loc[:, '图片数量'].swifter.apply(
lambda x: 0 if (x in ['上海', '云南', '山东'] or x != x) else int(x))
doc_info['二级分类'] = doc_info.loc[:, ['一级分类', '二级分类']].swifter.apply(get_second_title, axis=1)
doc_info['关键词'] = [get_key_word(words) for words in tqdm(doc_info['关键词'])]
doc_info.to_pickle(os.path.join(new_data_path, 'doc_info.pkl'))
if __name__ == '__main__':
main()

View File

@@ -0,0 +1,13 @@
原始数据集共包含3个实验时存放在目录`rank/examples/dataset/raw_data/`下。
+ **user_info_5w.csv**
+ 该文件共包含了5万条用户的个人数据
+ 特征分别包括了:['user_id', 'device', 'os', 'province', 'city', 'age', 'gender']
+ 各特征的含义为:['用户id', '设备名称', '操作系统', '所在省', '所在市', '年龄', '性别']
+ **doc_info.txt**
+ 该文件包含了所有新闻的特征数据;
+ 各特征的含义为:['文档id', '标题', '发文时间', '图片数量', '一级分类', '二级分类', '关键词']
+ **train_data_5w.csv**
+ 该文件为用户点击数据包含了5万个用户在过去13天的点击数据
+ 各特征的含义为:['用户id', '文档id', '展现时间', '网路环境', '刷新次数', '展现位置', '是否点击', '消费时长(秒)']

View File

@@ -0,0 +1,29 @@
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# @File : deepfm_news.py
# @Author: xLyons
# @IDE : PyCharm
# @Time : 2022/1/27
import argparse
from run_train import run_deepfm
from utils.set_parament import get_args
from dataset.data_process.create_ctr_data import create_ctr_data
parser = argparse.ArgumentParser(description='Model Parameter')
parser.add_argument('--yaml_path',
default='./set_para/deepfm_news.yaml',
required=False)
parser.add_argument('--data_path',
default='./dataset/new_data/',
required=False)
parse_args = parser.parse_args()
if __name__ == '__main__':
args = get_args(parse_args.yaml_path)
train_data, test_data, feature_info = create_ctr_data(parse_args.data_path, args)
run_deepfm.run(train_data, test_data, feature_info, args)

View File

@@ -0,0 +1,29 @@
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# @File : deepfm_ppnet_news.py
# @Author: xLyons
# @IDE : PyCharm
# @Time : 2022/2/8
import argparse
from run_train import run_deepfm_ppnet
from utils.set_parament import get_args
from dataset.data_process.create_ctr_data import create_ctr_data
parser = argparse.ArgumentParser(description='Model Parameter')
parser.add_argument('--yaml_path',
default='./set_para/deepfm_ppnet_news.yaml',
required=False)
parser.add_argument('--data_path',
default='./dataset/new_data/',
required=False)
parse_args = parser.parse_args()
if __name__ == '__main__':
args = get_args(parse_args.yaml_path)
train_data, test_data, feature_info = create_ctr_data(parse_args.data_path, args)
run_deepfm_ppnet.run(train_data, test_data, feature_info, args)

View File

@@ -0,0 +1,14 @@
# data para
seed: 48
# model para
embed_dim: 32
drop_rate: 0.5
use_bn: Ture
hidden_units: [64, 128, 64]
# compile para
learning_rate: 0.001
epochs: 1
batch_size: 2048
val_splite: 0.1
patience: 5
restore_best_weights: True

View File

@@ -0,0 +1,16 @@
# data para
seed: 48
# model para
embed_dim: 32
drop_rate: 0.5
ppnet_size: 256
ppnet_features: ['user_id', '一级分类', '年龄']
use_bn: Ture
hidden_units: [64, 128, 64]
# compile para
learning_rate: 0.001
epochs: 1
batch_size: 2048
val_splite: 0.1
patience: 5
restore_best_weights: True

View File

@@ -0,0 +1,6 @@
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# @File : __init__.py.py
# @Author: xLyons
# @IDE : PyCharm
# @Time : 2022/1/27

View File

@@ -0,0 +1,24 @@
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# @File : activation.py
# @Author: xLyons
# @IDE : PyCharm
# @Time : 2022/1/27
import tensorflow as tf
from tensorflow.keras.initializers import Zeros
from tensorflow.keras.layers import Layer
unicode = str
def activation_layer(activation):
if isinstance(activation, (str, unicode)):
act_layer = tf.keras.layers.Activation(activation)
elif issubclass(activation, Layer):
act_layer = activation()
else:
raise ValueError(
"Invalid activation,found %s.You should use a str or a Activation Layer Class." % (activation))
return act_layer

View File

@@ -0,0 +1,238 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @File : core.py
# @Author: xLyons
# @IDE PyCharm
# @Time : 2022/1/27
import tensorflow as tf
from tensorflow.keras.layers import Layer, Dense, Embedding, BatchNormalization, Dropout
from tensorflow.keras.initializers import glorot_normal, Zeros, glorot_uniform
from tensorflow.keras.regularizers import l2
from layers.activation import activation_layer
class DNN(Layer):
def __init__(self, hidden_units, activation='relu', l2_reg=.0, dropout_rate=.0, use_bn=False,
output_activation=None, seed=48, **kwargs):
self.hidden_units = hidden_units
self.activation = activation
self.l2_reg = l2_reg
self.dropout_rate = dropout_rate
self.use_bn = use_bn
self.output_activation = output_activation
self.seed = seed
super(DNN, self).__init__(**kwargs)
def build(self, input_shape):
input_size = input_shape[-1]
hidden_units = [int(input_size)] + list(self.hidden_units)
self.kernels = [self.add_weight(name='kernel' + str(i),
shape=(
hidden_units[i], hidden_units[i + 1]),
initializer=glorot_uniform(
seed=self.seed),
regularizer=l2(self.l2_reg),
trainable=True) for i in range(len(self.hidden_units))]
self.bias = [self.add_weight(name='bias' + str(i),
shape=(self.hidden_units[i],),
initializer=Zeros(),
trainable=True) for i in range(len(self.hidden_units))]
if self.use_bn:
self.bn_layers = [tf.keras.layers.BatchNormalization() for _ in range(len(self.hidden_units))]
self.dropout_layers = [tf.keras.layers.Dropout(self.dropout_rate, seed=self.seed + i) for i in
range(len(self.hidden_units))]
self.activation_layers = [activation_layer(self.activation) for _ in range(len(self.hidden_units))]
if self.output_activation:
self.activation_layers[-1] = activation_layer(self.output_activation)
super(DNN, self).build(input_shape)
def call(self, inputs, training=True, **kwargs):
deep_input = inputs
for i in range(len(self.hidden_units)):
fc = tf.nn.bias_add(tf.tensordot(
deep_input, self.kernels[i], axes=(-1, 0)), self.bias[i])
if self.use_bn:
fc = self.bn_layers[i](fc, training=training)
try:
fc = self.activation_layers[i](fc, training=training)
except TypeError as e:
print("make sure the activation function use training flag properly", e)
fc = self.activation_layers[i](fc)
fc = self.dropout_layers[i](fc, training=training)
deep_input = fc
return deep_input
class Linear(Layer):
def __init__(self, l2_reg=.0, use_bias=False, seed=48, **kwargs):
self.l2_reg = l2_reg
self.use_bias = use_bias
self.seed = seed
super().__init__(**kwargs)
def build(self, input_shape):
self.kernel = self.add_weight(
name='linear_kernel',
shape=(input_shape[-1], 1),
initializer=glorot_normal(self.seed),
regularizer=l2(self.l2_reg),
trainable=True,
)
if self.use_bias:
self.bias = self.add_weight(
name='linear_bais',
shape=(1, ),
initializer=Zeros(),
trainable=True
)
super(Linear, self).build(input_shape)
def call(self, inputs, **kwargs):
linear_logits = tf.tensordot(inputs, self.kernel, axes=1)
if self.use_bias:
linear_logits += self.bias
return linear_logits
class GateNN(Layer):
def __init__(self, hidden_units, activation='relu', l2_reg=.0, dropout_rate=.0, use_bn=False,
output_activation='sigmoid', seed=48, **kwargs):
self.hidden_units = hidden_units
self.activation = activation
self.l2_reg = l2_reg
self.dropout_rate = dropout_rate
self.use_bn = use_bn
self.output_activation = output_activation
self.seed = seed
super(GateNN, self).__init__(**kwargs)
def build(self, input_shape):
input_size = input_shape[-1]
hidden_units = [int(input_size)] + list(self.hidden_units)
self.kernels = [self.add_weight(name='kernel' + str(i),
shape=(
hidden_units[i], hidden_units[i + 1]),
initializer=glorot_uniform(
seed=self.seed),
regularizer=l2(self.l2_reg),
trainable=True) for i in range(len(self.hidden_units))]
self.bias = [self.add_weight(name='bias' + str(i),
shape=(self.hidden_units[i],),
initializer=Zeros(),
trainable=True) for i in range(len(self.hidden_units))]
self.activation_layers = [activation_layer(self.activation) for _ in range(len(self.hidden_units))]
if self.output_activation:
self.activation_layers[-1] = activation_layer(self.output_activation)
super(GateNN, self).build(input_shape)
def call(self, inputs, training=True, **kwargs):
deep_input = inputs
for i in range(len(self.hidden_units)):
fc = tf.nn.bias_add(tf.tensordot(
deep_input, self.kernels[i], axes=(-1, 0)), self.bias[i])
try:
fc = self.activation_layers[i](fc, training=training)
except TypeError as e:
print("make sure the activation function use training flag properly", e)
fc = self.activation_layers[i](fc)
deep_input = fc
return deep_input
class PPNet(Layer):
def __init__(self, ppnet_size, hidden_units, activation='relu', l2_reg=.0, dropout_rate=.0, use_bn=False,
output_activation=None, seed=48, **kwargs):
self.ppnet_size = ppnet_size
self.hidden_units = hidden_units
self.activation = activation
self.l2_reg = l2_reg
self.dropout_rate = dropout_rate
self.use_bn = use_bn
self.output_activation = output_activation
self.seed = seed
super(PPNet, self).__init__(**kwargs)
def build(self, input_shape):
input_size = input_shape[0][-1]
hidden_units = [int(input_size)] + list(self.hidden_units)
self.gate_nn_layers = [
GateNN(hidden_units=[self.ppnet_size, hidden_units[i]],
activation='relu',
output_activation='sigmoid',
l2_reg=self.l2_reg,
seed=self.seed)
for i in range(len(self.hidden_units))
]
self.kernels = [self.add_weight(name='kernel' + str(i),
shape=(hidden_units[i], hidden_units[i + 1]),
initializer=glorot_uniform(
seed=self.seed),
regularizer=l2(self.l2_reg),
trainable=True) for i in range(len(self.hidden_units))]
self.bias = [self.add_weight(name='bias' + str(i),
shape=(self.hidden_units[i],),
initializer=Zeros(),
trainable=True) for i in range(len(self.hidden_units))]
if self.use_bn:
self.bn_layers = [tf.keras.layers.BatchNormalization() for _ in range(len(self.hidden_units))]
self.dropout_layers = [tf.keras.layers.Dropout(self.dropout_rate, seed=self.seed + i) for i in
range(len(self.hidden_units))]
self.activation_layers = [activation_layer(self.activation) for _ in range(len(self.hidden_units))]
if self.output_activation:
self.activation_layers[-1] = activation_layer(self.output_activation)
super(PPNet, self).build(input_shape)
def call(self, inputs, training=True, **kwargs):
deep_input, ppnet_input = inputs
for i in range(len(self.hidden_units)):
ppnet_scale = self.gate_nn_layers[i](ppnet_input)
deep_input = deep_input * ppnet_scale * 2
fc = tf.nn.bias_add(tf.tensordot(
deep_input, self.kernels[i], axes=(-1, 0)), self.bias[i])
if self.use_bn:
fc = self.bn_layers[i](fc, training=training)
try:
fc = self.activation_layers[i](fc, training=training)
except TypeError as e:
print("make sure the activation function use training flag properly", e)
fc = self.activation_layers[i](fc)
fc = self.dropout_layers[i](fc, training=training)
deep_input = fc
return deep_input

View File

@@ -0,0 +1,41 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @File : embedding.py
# @Author: xLyons
# @IDE PyCharm
# @Time : 2021/8/27
from tensorflow.keras.layers import Embedding
from tensorflow.keras.regularizers import l2
def create_embed_dict(sparse_feature_columns, embed_l2_reg):
sparse_embed_dict = {}
for feat in sparse_feature_columns:
feat_embed_name = feat.embed_name
if feat_embed_name not in sparse_embed_dict.keys():
embed_layer = Embedding(
input_dim=feat.vocab_size,
input_length=1,
output_dim=feat.embed_dim,
embeddings_initializer=feat.embed_init,
embeddings_regularizer=l2(embed_l2_reg)
)
embed_layer.trainable = True
sparse_embed_dict[feat_embed_name] = embed_layer
return sparse_embed_dict
def embedding_lookup(sparse_embed_dict, feat_inputs, sparse_feature_columns, query_features=(), to_list=False):
feat_embed_outputs = {}
for feat in sparse_feature_columns:
feat_name = feat.name
if len(query_features) == 0 or feat_name in query_features:
feat_input = feat_inputs[feat_name]
feat_embed_outputs[feat_name] = sparse_embed_dict[feat.embed_name](feat_input)
if to_list:
return list(feat_embed_outputs.values())
return feat_embed_outputs

View File

@@ -0,0 +1,32 @@
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# @file : interaction.py
# @Author: xLyons
# @IDE : PyCharm
# @Time : 2021/9/15
import tensorflow as tf
from tensorflow.keras.layers import Layer
class FMCross(Layer):
def __init__(self, **kwargs):
super(FMCross, self).__init__(**kwargs)
def build(self, input_shape):
if len(input_shape) != 3:
raise ValueError("Unexpected inputs dimensions % d,\
expect to be 3 dimensions" % (len(input_shape)))
super(FMCross, self).build(input_shape) # Be sure to call this somewhere!
def call(self, inputs, **kwargs):
square_of_sum = tf.square(tf.reduce_sum(inputs, axis=1, keepdims=True)) # None, 1, dim
sum_of_square = tf.reduce_sum(inputs * inputs, axis=1, keepdims=True) # None, 1, dim
cross_term = square_of_sum - sum_of_square
cross_term = 0.5 * tf.reduce_sum(cross_term, axis=2, keepdims=False) # None, 1
return cross_term

View File

@@ -0,0 +1,6 @@
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# @File : __init__.py.py
# @Author: xLyons
# @IDE : PyCharm
# @Time : 2022/1/27

View File

@@ -0,0 +1,102 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @File : feature_columns.py
# @Author: xLyons
# @IDE PyCharm
# @Time : 2022/1/27
import copy
import tensorflow as tf
from collections import OrderedDict
from tensorflow.keras.layers import Input, Flatten, Concatenate
from tensorflow.keras.initializers import RandomNormal, Zeros
from tensorflow.keras.layers.experimental.preprocessing import StringLookup
from layers.core import Linear
from layers.embedding import create_embed_dict, embedding_lookup
class SparseFeat(object):
def __init__(self, name, embed_dim, vocab_size, dtype, embed_name=None, seed=48):
self.name = name
self.vocab_size = vocab_size
self.embed_dim = embed_dim
self.embed_init = RandomNormal(mean=0.0, stddev=0.01, seed=seed)
self.dtype = dtype
self.embed_name = embed_name if embed_name else name
super(SparseFeat, self).__init__()
class DenseFeat(object):
def __init__(self, name, dimension, dtype=None):
self.name = name
self.dimension = dimension
self.dtype = dtype
super(DenseFeat, self).__init__()
def build_feature_inputs(feature_columns):
feat_inputs = OrderedDict()
for feat in feature_columns:
if isinstance(feat, SparseFeat):
sparse_inputs = Input(shape=(1, ),
name=feat.name,
dtype=feat.dtype)
feat_inputs[feat.name] = sparse_inputs
elif isinstance(feat, DenseFeat):
dense_inputs = Input(shape=(feat.dimension, ),
name=feat.name,
dtype=feat.dtype)
feat_inputs[feat.name] = dense_inputs
else:
raise TypeError("Invalid feature column type,got", type(feat))
return feat_inputs
def build_feature_coding_model(all_data, sparse_features):
feature_vocab_dict = dict()
for feat in sparse_features:
string_model = StringLookup(vocabulary=all_data[feat].unique(),
mask_token=None)
feature_vocab_dict[feat] = string_model
return feature_vocab_dict
def get_dense_inputs(feat_inputs, feature_columns, concat_flag=True):
dense_inputs = []
for feat in feature_columns:
if isinstance(feat, DenseFeat):
dense_inputs.append(feat_inputs[feat.name])
if concat_flag:
dense_inputs = tf.concat(dense_inputs, axis=-1)
return dense_inputs
def get_linear_logit(feat_inputs, feature_columns, linear_l2_reg=.0, embed_l2_reg=1e-5, use_bias=True, seed=48,):
linear_features = copy.deepcopy(feature_columns)
for feat in linear_features:
if isinstance(feat, SparseFeat):
feat.embed_dim = 1
feat.embed_init = Zeros()
sparse_feature_columns = list(
filter(lambda x: isinstance(x, SparseFeat), linear_features)) if feature_columns else []
sparse_embed_dict = create_embed_dict(sparse_feature_columns, embed_l2_reg)
sparse_embed_list = embedding_lookup(sparse_embed_dict, feat_inputs, sparse_feature_columns, to_list=True)
dense_inputs = get_dense_inputs(feat_inputs, linear_features, concat_flag=True)
sparse_embed_inputs = Flatten()(Concatenate(axis=-1)(sparse_embed_list))
linear_inputs = tf.concat([dense_inputs, sparse_embed_inputs], axis=-1)
linear_logit = Linear(linear_l2_reg, use_bias, seed)(linear_inputs)
return linear_logit

View File

@@ -0,0 +1,6 @@
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# @File : __init__.py.py
# @Author: xLyons
# @IDE : PyCharm
# @Time : 2022/1/27

View File

@@ -0,0 +1,63 @@
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# @File : deepfm.py
# @Author: xLyons
# @IDE : PyCharm
# @Time : 2022/1/27
from tensorflow.keras.layers import Dense
from layers.core import DNN
from layers.interaction import FMCross
from model_tools.feature_columns import *
def DeepFM(feature_columns,
dnn_hidden_units,
embed_l2_reg=1e-5,
linear_l2_reg=1e-5,
linear_use_bias=True,
dnn_l2_reg=1e-5,
dnn_drop_rate=.0,
dnn_use_bn=False,
dnn_activation='relu',
seed=48):
feat_inputs = build_feature_inputs(feature_columns)
inputs_list = list(feat_inputs.values())
sparse_feature_columns = list(
filter(lambda x: isinstance(x, SparseFeat), feature_columns)) if feature_columns else []
sparse_embed_dict = create_embed_dict(sparse_feature_columns, embed_l2_reg)
sparse_embed_list = embedding_lookup(sparse_embed_dict, feat_inputs, sparse_feature_columns, to_list=True)
dense_inputs = get_dense_inputs(feat_inputs, feature_columns, concat_flag=True)
linear_logit = get_linear_logit(feat_inputs=feat_inputs,
feature_columns=feature_columns,
linear_l2_reg=linear_l2_reg,
embed_l2_reg=embed_l2_reg,
use_bias=linear_use_bias,
seed=seed)
fm_inputs = Concatenate(axis=1)(sparse_embed_list)
fm_logit = FMCross()(fm_inputs)
sparse_embed_inputs = Flatten()(Concatenate(axis=-1)(sparse_embed_list))
dnn_inputs = tf.concat([dense_inputs, sparse_embed_inputs], axis=-1)
dnn_logit = DNN(hidden_units=dnn_hidden_units,
activation=dnn_activation,
l2_reg=dnn_l2_reg,
dropout_rate=dnn_drop_rate,
use_bn=dnn_use_bn
)(dnn_inputs)
dnn_logit = Dense(units=1,
use_bias=False,
kernel_initializer=tf.keras.initializers.glorot_uniform(seed=seed)
)(dnn_logit)
final_outputs = tf.nn.sigmoid(linear_logit + fm_logit + dnn_logit)
model = tf.keras.models.Model(inputs=inputs_list, outputs=final_outputs)
return model

View File

@@ -0,0 +1,77 @@
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# @File : deepfm_ppnet.py
# @Author: xLyons
# @IDE : PyCharm
# @Time : 2022/2/8
from tensorflow.keras.layers import Dense
from layers.core import PPNet
from layers.interaction import FMCross
from model_tools.feature_columns import *
def DeepFM_PPNet(
feature_columns,
ppnet_size,
ppnet_features,
dnn_hidden_units,
embed_l2_reg=1e-5,
linear_l2_reg=1e-5,
linear_use_bias=True,
dnn_l2_reg=1e-5,
dnn_drop_rate=.0,
dnn_use_bn=False,
dnn_activation='relu',
seed=48):
feat_inputs = build_feature_inputs(feature_columns)
inputs_list = list(feat_inputs.values())
sparse_feature_columns = list(
filter(lambda x: isinstance(x, SparseFeat), feature_columns)) if feature_columns else []
sparse_embed_dict = create_embed_dict(sparse_feature_columns, embed_l2_reg)
sparse_embed_list = embedding_lookup(sparse_embed_dict, feat_inputs, sparse_feature_columns, to_list=True)
dense_inputs = get_dense_inputs(feat_inputs, feature_columns, concat_flag=True)
linear_logit = get_linear_logit(feat_inputs=feat_inputs,
feature_columns=feature_columns,
linear_l2_reg=linear_l2_reg,
embed_l2_reg=embed_l2_reg,
use_bias=linear_use_bias,
seed=seed)
fm_inputs = Concatenate(axis=1)(sparse_embed_list)
fm_logit = FMCross()(fm_inputs)
sparse_embed_inputs = Flatten()(Concatenate(axis=-1)(sparse_embed_list))
dnn_inputs = tf.concat([dense_inputs, sparse_embed_inputs], axis=-1)
ppnet_feature_columns = list(
filter(lambda x: x.name in ppnet_features, feature_columns)) if feature_columns else []
ppnet_embed_list = embedding_lookup(sparse_embed_dict, feat_inputs, ppnet_feature_columns, to_list=True)
ppnet_inputs = Flatten()(Concatenate(axis=-1)(ppnet_embed_list))
# stop gradient propagation
ppnet_inputs = tf.stop_gradient(ppnet_inputs)
dnn_logit = PPNet(
ppnet_size=ppnet_size,
hidden_units=dnn_hidden_units,
activation=dnn_activation,
l2_reg=dnn_l2_reg,
dropout_rate=dnn_drop_rate,
use_bn=dnn_use_bn
)([dnn_inputs, ppnet_inputs])
dnn_logit = Dense(units=1,
use_bias=False,
kernel_initializer=tf.keras.initializers.glorot_uniform(seed=seed)
)(dnn_logit)
final_outputs = tf.nn.sigmoid(linear_logit + fm_logit + dnn_logit)
model = tf.keras.models.Model(inputs=inputs_list, outputs=final_outputs)
return model

View File

@@ -0,0 +1,273 @@
# 1. 数据集介绍
原始数据集共包含3个实验时存放在目录`rank/examples/dataset/raw_data/`下。
+ **user_info_5w.csv**
+ 该文件共包含了5万条用户的个人数据
+ 特征分别包括了:['user_id', 'device', 'os', 'province', 'city', 'age', 'gender']
+ 各特征的含义为:['用户id', '设备名称', '操作系统', '所在省', '所在市', '年龄', '性别']
+ **doc_info.txt**
+ 该文件包含了所有新闻的特征数据;
+ 各特征的含义为:['文档id', '标题', '发文时间', '图片数量', '一级分类', '二级分类', '关键词']
+ **train_data_5w.csv**
+ 该文件为用户点击数据包含了5万个用户在过去13天的点击数据
+ 各特征的含义为:['用户id', '文档id', '展现时间', '网路环境', '刷新次数', '展现位置', '是否点击', '消费时长(秒)']
# 2. 数据处理
数据处理的文件存放在`rank/examples/dataset/data_process/`下。
## 2.1 训练集和测试集的划分
训练集和测试集的划分程序为:**train&test_data_split.py**
+ 训练集将所有用户在前12天的点击行为划为训练集
+ 测试集:
+ 将所有用户在第13天的点击行为化为测试集
+ 测试集丢弃特征:消费时长(秒),展现位置,是否点击;
+ 测试标签:
+ 将测试集的真实标签单独进行存储;
其他特征处理:
+ 选中消费时长小于0的样本并将其消费时长设置为0
+ 对所有样本的展现时间进行了格式处理,并新增了特征展现日期;
+ 新增特征index目的是为了后续对测试集进行评估
## 2.2 特征处理
### 2.2.1 用户和文档特征
用户数据和文档数据的处理程序为:**user&doc_data_process.py**
**用户数据处理**
+ 性别特征:原始的用户性别数据为用户对应不同性别的概率,这里直接将概率最高的性别作为用户的实际性别;
+ 年龄数据:原始的用户年龄数据为用户对应不同年龄段的概率,这里将概率最高的年龄段作为用户所处的年龄段;
**文档数据处理**
+ 发文时间:对于部分发文时间异常或者为空的数据,使用已有文档中最早的发文时间进行填充;
+ 发文日期:将文档的发文时间,提取出对应的发文日期(年-月-日);
+ 图片数量:对部分异常的脏数据,使用 $0$ 进行填充处理;
+ 二级分类:对于存在多个二级分类的文档,从其选取一个作为其二级分类。优先选择不等于一级分类的二级分类,对于二级分类为空的文档使用一级分类进行填充;
+ 关键词:
+ 每篇文档均存在多个关键词,每个关键词也会对应一个权重,这里选取权重最高的关键词作为文章的唯一关键词;
+ 文档中不同的关键词及权重是采用逗号进行隔开的,但部分关键词本来就包含逗号(如==你好,李焕英==),故相关函数还对此进行了特殊处理;
### 2.2.2 统计特征
统计特征的生成的程序为:**news_data_process.py**
+ 从文档发文到展示的时间差:对于每一个样本,统计对应文档从发文到展示的日期差;
+ 用户特征统计:
+ 统计每个用户过去几天,所展现的文档总数;
+ 统计每个用户过去几天,在不同类别文档(一级分类)上的展现总数;
+ 统计每个用户过去几天,在不同类别文档(二级分类)上的展现总数;
+ 统计每个用户过去几天,整体的点击率;
+ 统计每个用户过去几天,对不同类别文档(一级分类)上的点击率;
+ 统计每个用户过去几天,对不同类别文档(二级分类)上的点击率;
+ 统计每个用户过去几天,消费时长的总和;
+ 统计每个用户过去几天,在不同类别文档(一级分类)上的总消费时长;
+ 统计每个用户过去几天,在不同类别文档(二级分类)上的总消费时长;
+ 统计每个用户过去几天,在不同类别文档(一级分类)上的平均消费时长;
+ 统计每个用户过去几天,在不同类别文档(二级分类)上的平均消费时长;
+ 统计每个用户过去几天,在不同类别文档(一级分类)上的消费时长的方差;
+ 统计每个用户过去几天,在不同类别文档(二级分类)上的消费时长的方差;
+ 文档特征统计:
+ 统计每篇文档在过去几天,被展示的总次数;
+ 统计各类别(一级分类)文档在过去几天,被展示的总次数;
+ 统计各类别(二级分类)文档在过去几天,被展示的总次数;
+ 统计每篇文档在过去几天,平均的被点击率;
+ 统计各类别(一级分类)文档在过去几天,平均被点击率;
+ 统计各类别(二级分类)文档在过去几天,平均被点击率;
+ 统计每篇文档在过去几天,总的被消费时长;
+ 统计每篇文档在过去几天,平均的被消费时长;
+ 统计每篇文档在过去几天,被消费时长的方差;
+ 统计各类别(一级分类)文档在过去几天,总的被消费时长;
+ 统计各类别(一级分类)文档在过去几天,平均的被消费时长;
+ 统计各类别(一级分类)文档在过去几天,被消费时长的方差;
+ 统计各类别(二级分类)文档在过去几天,总的被消费时长;
+ 统计各类别(二级分类)文档在过去几天,平均的被消费时长;
+ 统计各类别(二级分类)文档在过去几天,被消费时长的方差;
### 2.2.3 特征归一化和编码
+ 连续型特征:
+ 连续型特征包含的主要是统计特征,这里对于空值统一使用 $0$ 进行填充;
+ 之后,对所有的连续型特征进行对数归一化, 即取 $log$ 对数;
+ 类别型特征:
+ 类别型特征这里主要是通过 $LabelEncoder$ 的方式进行编码,以便后续模型处理为相应的 $Embedding$
# 3. 排序模型
排序模型的执行程序存放在`rank/examples/`下,分别为`deepfm_news.py``deepfm_ppnet_news.py`
## 3.1 DeepFM
DeepFM是2017年由华为与哈工大提出的排序模型模型主要包含两部分FM部分+Deep部分。
+ FM部分对不同特征域的Embedding进行两两交叉,以加强模型在浅层网络中的特征组合能力。
+ Deep部分多层感知机网络模型。通过对特征各个维度进行充分的特征交叉组合来学习到更多非线性以及组合特征的信息。
论文链接:[[DeepFM: A Factorization-Machine based Neural Network for CTR Prediction (arxiv.org)](https://arxiv.org/abs/1703.04247)
**实验结果**
1. 参数设置
```yaml
# data para
seed: 48
# model para
embed_dim: 32
drop_rate: 0.5
use_bn: Ture
hidden_units: [64, 128, 64]
# compile para
learning_rate: 0.001
epochs: 20
batch_size: 2048
val_splite: 0.1
patience: 5
restore_best_weights: True
```
2. 运行结果
```bash
Epoch 1/20
2653/2653 [==============================] - 47s 17ms/step - loss: 0.3921 - auc: 0.7287 - val_loss: 0.3628 - val_auc: 0.7588
Epoch 2/20
2653/2653 [==============================] - 44s 17ms/step - loss: 0.3619 - auc: 0.7616 - val_loss: 0.3581 - val_auc: 0.7647
Epoch 3/20
2653/2653 [==============================] - 44s 17ms/step - loss: 0.3569 - auc: 0.7705 - val_loss: 0.3561 - val_auc: 0.7682
Epoch 4/20
2653/2653 [==============================] - 47s 18ms/step - loss: 0.3548 - auc: 0.7754 - val_loss: 0.3557 - val_auc: 0.7699
Epoch 5/20
2653/2653 [==============================] - 47s 18ms/step - loss: 0.3540 - auc: 0.7777 - val_loss: 0.3560 - val_auc: 0.7702
Epoch 6/20
2653/2653 [==============================] - 46s 18ms/step - loss: 0.3536 - auc: 0.7788 - val_loss: 0.3557 - val_auc: 0.7708
Epoch 7/20
2653/2653 [==============================] - 45s 17ms/step - loss: 0.3533 - auc: 0.7797 - val_loss: 0.3556 - val_auc: 0.7714
Epoch 8/20
2653/2653 [==============================] - 45s 17ms/step - loss: 0.3532 - auc: 0.7802 - val_loss: 0.3558 - val_auc: 0.7712
Epoch 9/20
2653/2653 [==============================] - 46s 17ms/step - loss: 0.3530 - auc: 0.7806 - val_loss: 0.3560 - val_auc: 0.7713
Epoch 10/20
2653/2653 [==============================] - 46s 17ms/step - loss: 0.3530 - auc: 0.7808 - val_loss: 0.3560 - val_auc: 0.7711
Epoch 11/20
2653/2653 [==============================] - 45s 17ms/step - loss: 0.3529 - auc: 0.7811 - val_loss: 0.3560 - val_auc: 0.7715
Epoch 12/20
2653/2653 [==============================] - 46s 17ms/step - loss: 0.3528 - auc: 0.7813 - val_loss: 0.3557 - val_auc: 0.7718
251/251 [==============================] - 3s 11ms/step - loss: 0.3719 - auc: 0.7508
test AUC: 0.750784
```
## 3.2 DeepFM+PPNet
将DeepFM模型中DNN 模块替换为PPNet模型
+ 在语音识别领域中2014 年和 2016 年提出的 LHUC 算法learning hidden unit contributions核心思想是做说话人自适应speaker adaptation其中一个关键突破是在 DNN 网络中为每个说话人学习一个特定的隐式单位贡献hidden unit contributions来提升不同说话人的语音识别效果。
+ 借鉴 LHUC 的思想,快手推荐团队在精排模型上展开了尝试。经过多次迭代优化,推荐团队设计出一种 gating 机制,可以增加 DNN 网络参数个性化并能够让模型快速收敛。快手把这种模型叫做 **PPNetParameter Personalized Net**
参考链接:[1.9万亿参数量,快手落地业界首个万亿参数推荐精排模型](https://mp.weixin.qq.com/s?__biz=MzA3MzI4MjgzMw==&idx=4&mid=2650808254&scene=21&sn=6c295c8306b7339858f8ecfadfc9d698#wechat_redirect)
**实验结果:**
1. 参数设置
```yaml
# data para
seed: 48
# model para
embed_dim: 32
drop_rate: 0.5
ppnet_size: 256
ppnet_features: ['user_id', '一级分类', '年龄']
use_bn: Ture
hidden_units: [64, 128, 64]
# compile para
learning_rate: 0.001
epochs: 20
batch_size: 2048
val_splite: 0.1
patience: 5
restore_best_weights: True
```
2. 运行结果
```bash
Epoch 1/20
2653/2653 [==============================] - 56s 20ms/step - loss: 0.3929 - auc: 0.7303 - val_loss: 0.3648 - val_auc: 0.7568
Epoch 2/20
2653/2653 [==============================] - 53s 20ms/step - loss: 0.3620 - auc: 0.7622 - val_loss: 0.3591 - val_auc: 0.7651
Epoch 3/20
2653/2653 [==============================] - 55s 21ms/step - loss: 0.3578 - auc: 0.7706 - val_loss: 0.3580 - val_auc: 0.7690
Epoch 4/20
2653/2653 [==============================] - 53s 20ms/step - loss: 0.3560 - auc: 0.7755 - val_loss: 0.3587 - val_auc: 0.7701
Epoch 5/20
2653/2653 [==============================] - 54s 20ms/step - loss: 0.3551 - auc: 0.7787 - val_loss: 0.3580 - val_auc: 0.7706
Epoch 6/20
2653/2653 [==============================] - 55s 21ms/step - loss: 0.3545 - auc: 0.7809 - val_loss: 0.3587 - val_auc: 0.7718
Epoch 7/20
2653/2653 [==============================] - 54s 20ms/step - loss: 0.3541 - auc: 0.7829 - val_loss: 0.3586 - val_auc: 0.7720
Epoch 8/20
2653/2653 [==============================] - 53s 20ms/step - loss: 0.3538 - auc: 0.7842 - val_loss: 0.3587 - val_auc: 0.7721
251/251 [==============================] - 4s 13ms/step - loss: 0.3686 - auc: 0.7543
test AUC: 0.754304
```
# 4. 程序执行
```bash
# 数据预处理
1. user&doc_data_process.py
2. train&test_data_split.py
3. news_data_process.py
# 排序模型
4. deepfm_news.py 或 deepfm_ppnet_news.py
```
# Requirements
- Tensorflow2.5 (GPU)
- Numpy
- Pandas
- Swifter
- Sklearn

View File

@@ -0,0 +1,39 @@
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# @File : run_deepfm.py
# @Author: xLyons
# @IDE : PyCharm
# @Time : 2022/1/27
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import AUC
from models.deepfm import DeepFM
def run(train_data, test_data, feature_columns, args):
# 1. 建模
model = DeepFM(feature_columns=feature_columns,
dnn_hidden_units=args.hidden_units,
dnn_drop_rate=args.drop_rate,
dnn_use_bn=args.use_bn)
# 2. 编译
model.compile(optimizer=Adam(learning_rate=args.learning_rate),
loss="binary_crossentropy",
metrics=[AUC()])
model.summary()
# 3. 训练
model.fit(train_data[0],
train_data[1],
batch_size=args.batch_size,
epochs=args.epochs,
callbacks=[EarlyStopping(monitor='val_loss',
patience=args.patience,
mode='min',
restore_best_weights=args.restore_best_weights)],
validation_split=args.val_splite,
)
# 4. 测试
print('test AUC: %f' % model.evaluate(test_data[0], test_data[1], batch_size=args.batch_size)[1])

View File

@@ -0,0 +1,42 @@
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# @File : run_deepfm_ppnet.py
# @Author: xLyons
# @IDE : PyCharm
# @Time : 2022/2/8
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import AUC
from models.deepfm_ppnet import DeepFM_PPNet
def run(train_data, test_data, feature_columns, args):
# 1. 建模
model = DeepFM_PPNet(
feature_columns=feature_columns,
ppnet_size=args.ppnet_size,
ppnet_features=args.ppnet_features,
dnn_hidden_units=args.hidden_units,
dnn_drop_rate=args.drop_rate,
dnn_use_bn=args.use_bn)
# 2. 编译
model.compile(optimizer=Adam(learning_rate=args.learning_rate),
loss="binary_crossentropy",
metrics=[AUC()])
model.summary()
# 3. 训练
model.fit(train_data[0],
train_data[1],
batch_size=args.batch_size,
epochs=args.epochs,
callbacks=[EarlyStopping(monitor='val_loss',
patience=args.patience,
mode='min',
restore_best_weights=args.restore_best_weights)],
validation_split=args.val_splite,
)
# 4. 测试
print('test AUC: %f' % model.evaluate(test_data[0], test_data[1], batch_size=args.batch_size)[1])

View File

@@ -0,0 +1,44 @@
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# @File : data_compression.py
# @Author: xLyons
# @IDE : PyCharm
# @Time : 2022/2/7
import gc
import numpy as np
from tqdm.auto import tqdm
def reduce_mem(df):
start_mem = df.memory_usage().sum() / 1024 ** 2
print(f'开始进行内存压缩...')
for col in tqdm(df.columns):
col_type = df[col].dtypes
if col_type != object:
c_min = df[col].min()
c_max = df[col].max()
if str(col_type)[:3] == 'int':
if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
df[col] = df[col].astype(np.int8)
elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
df[col] = df[col].astype(np.int16)
elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
df[col] = df[col].astype(np.int32)
elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
df[col] = df[col].astype(np.int64)
elif str(col_type)[:5] == 'float':
if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
df[col] = df[col].astype(np.float16)
elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
df[col] = df[col].astype(np.float32)
else:
df[col] = df[col].astype(np.float64)
end_mem = df.memory_usage().sum() / 1024 ** 2
print('{:.2f} Mb, {:.2f} Mb ({:.2f} %)'.format(start_mem, end_mem, 100 * (start_mem - end_mem) / start_mem))
gc.collect()
return df

View File

@@ -0,0 +1,16 @@
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# @file : set_device.py
# @Author: xLyons
# @IDE : PyCharm
# @Time : 2022/2/7
import tensorflow as tf
def set_GPU():
gpus = tf.config.experimental.list_physical_devices('GPU')
print(gpus)
for gpu in gpus:
tf.config.experimental.set_memory_growth(gpu, True)

View File

@@ -0,0 +1,20 @@
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# @file : set_parament.py
# @Author: xLyons
# @IDE : PyCharm
# @Time : 2022/2/7
import yaml
from collections import namedtuple
def get_args(yaml_path):
with open(yaml_path, 'r', encoding='utf-8') as f:
para_dict = yaml.load(f.read(), Loader=yaml.FullLoader)
ps = namedtuple('parser', list(para_dict.keys()))
args = ps(**para_dict)
f.close()
return args