Binary file not shown.
Binary file not shown.
@@ -0,0 +1,53 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding:utf-8 -*-
|
||||
# @File : create_ctr_data.py
|
||||
# @Author: xLyons
|
||||
# @IDE : PyCharm
|
||||
# @Time : 2022/2/7
|
||||
|
||||
import pickle
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from sklearn.utils import shuffle
|
||||
|
||||
from model_tools.feature_columns import SparseFeat, DenseFeat
|
||||
|
||||
|
||||
def create_ctr_data(data_path, args, use_dict=True):
|
||||
with open(data_path + 'data.pkl', 'rb') as f:
|
||||
all_data, feature_info = pickle.load(f)
|
||||
f.close()
|
||||
|
||||
# 训练数据和测试数据
|
||||
all_data = shuffle(all_data)
|
||||
train_df = all_data[all_data['是否点击'] != -1]
|
||||
test_df = all_data[all_data['是否点击'] == -1]
|
||||
# 测试数据的标签
|
||||
test_labels = pd.read_pickle(data_path + 'test_label.pkl')
|
||||
test_labels = pd.merge(test_df[['index']], test_labels, how='left', on=['index'])
|
||||
|
||||
all_features = feature_info['dense_features'] + feature_info['sparse_features']
|
||||
if use_dict:
|
||||
train_inputs = {name: np.array(train_df[name].tolist()) for name in all_features}
|
||||
train_labels = train_df['是否点击'].values
|
||||
test_inputs = {name: np.array(test_df[name].tolist()) for name in all_features}
|
||||
test_labels = test_labels['是否点击'].values
|
||||
else:
|
||||
train_inputs = [np.array(train_df[name]) for name in all_features]
|
||||
train_labels = train_df['是否点击'].values
|
||||
test_inputs = [np.array(test_df[name]) for name in all_features]
|
||||
test_labels = test_labels['是否点击'].values
|
||||
|
||||
features_columns = [DenseFeat(name=feat,
|
||||
dimension=1,
|
||||
dtype='float32',)
|
||||
for feat in feature_info['dense_features']]
|
||||
|
||||
features_columns += [SparseFeat(name=feat,
|
||||
embed_name=feat,
|
||||
embed_dim=args.embed_dim,
|
||||
vocab_size=all_data[feat].max()+1,
|
||||
dtype='int32',)
|
||||
for feat in feature_info['sparse_features']]
|
||||
|
||||
return (train_inputs, train_labels), (test_inputs, test_labels), features_columns
|
||||
@@ -0,0 +1,163 @@
|
||||
#!/usrbin/env python
|
||||
# -*- coding:utf-8 -*-
|
||||
# @File : news_data_process.py
|
||||
# @Author: xLyons
|
||||
# @IDE : PyCharm
|
||||
# @Time : 2022/2/7
|
||||
|
||||
import os
|
||||
import gc
|
||||
import swifter
|
||||
import pickle
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
from tqdm.auto import tqdm
|
||||
from sklearn.preprocessing import LabelEncoder, StandardScaler
|
||||
from utils.data_compression import reduce_mem
|
||||
|
||||
|
||||
def get_statistical_features(all_data, past_day=7):
|
||||
# 统计新闻从发文到展示的日期差
|
||||
temp = all_data['展现日期'] - all_data['发文日期']
|
||||
all_data['从发文到展现的日期差'] = temp.dt.days
|
||||
all_data.loc[all_data['从发文到展现的日期差'] < 0, '从发文到展现的日期差'] = 0
|
||||
all_data.fillna(value={'从发文到展现的日期差': 0}, inplace=True)
|
||||
|
||||
statis_dense_columns = ['从发文到展现的日期差']
|
||||
|
||||
dates = all_data['展现日期'].unique()
|
||||
dates.sort()
|
||||
date_num = len(dates)
|
||||
date_map = dict(zip(dates, range(date_num)))
|
||||
all_data['展现日期_idx'] = all_data['展现日期'].map(date_map)
|
||||
|
||||
train_data = all_data[all_data['是否点击'] != -1]
|
||||
|
||||
# ===================================================================================
|
||||
for feat in tqdm([['user_id'], ['item_id'], ['一级分类'], ['二级分类'],
|
||||
['user_id', '一级分类'], ['user_id', '二级分类']]):
|
||||
res_arr = []
|
||||
name = f'过去{past_day}天_特征({"_".join(feat)})_展现总数'
|
||||
statis_dense_columns.append(name)
|
||||
|
||||
for day in range(0, date_num):
|
||||
train_data_temp = train_data[
|
||||
(train_data['展现日期_idx'] >= day-past_day) & (train_data['展现日期_idx'] < day)]
|
||||
train_data_temp = train_data_temp.groupby(feat)['item_id'].agg([
|
||||
(name, 'count')]).reset_index()
|
||||
train_data_temp['展现日期_idx'] = day
|
||||
res_arr.append(train_data_temp)
|
||||
stat_all_data = pd.concat(res_arr)
|
||||
all_data = all_data.merge(stat_all_data, how='left', on=feat + ['展现日期_idx'])
|
||||
|
||||
target = '是否点击'
|
||||
for feat in tqdm([['user_id'], ['item_id'], ['一级分类'], ['二级分类'],
|
||||
['user_id', '一级分类'], ['user_id', '二级分类']]):
|
||||
res_arr = []
|
||||
name_mean = f'过去{past_day}天_特征({"_".join(feat)})_点击率mean'
|
||||
name_sum = f'过去{past_day}天_特征({"_".join(feat)})_点击总数sum'
|
||||
|
||||
statis_dense_columns.append(name_mean)
|
||||
statis_dense_columns.append(name_sum)
|
||||
|
||||
for day in range(0, date_num):
|
||||
train_data_temp = train_data[
|
||||
(train_data['展现日期_idx'] >= day-past_day) & (train_data['展现日期_idx'] < day)]
|
||||
train_data_temp = train_data_temp.groupby(feat)[target].agg(
|
||||
[(name_mean, 'mean'), (name_sum, 'sum')]).reset_index()
|
||||
train_data_temp['展现日期_idx'] = day
|
||||
res_arr.append(train_data_temp)
|
||||
stat_all_data = pd.concat(res_arr)
|
||||
all_data = all_data.merge(stat_all_data, how='left', on=feat + ['展现日期_idx'])
|
||||
|
||||
target = '消费时长(秒)'
|
||||
for feat in tqdm([['user_id'], ['item_id'], ['一级分类'], ['二级分类'],
|
||||
['user_id', '一级分类'], ['user_id', '二级分类']]):
|
||||
res_arr = []
|
||||
name_mean = f'过去{past_day}天_特征({"_".join(feat)})_消费时长mean'
|
||||
name_std = f'过去{past_day}天_特征({"_".join(feat)})_消费时长std'
|
||||
name_sum = f'过去{past_day}天_特征({"_".join(feat)})_消费时长sum'
|
||||
statis_dense_columns.append(name_mean)
|
||||
statis_dense_columns.append(name_std)
|
||||
statis_dense_columns.append(name_sum)
|
||||
|
||||
for day in range(0, date_num):
|
||||
train_data_temp = train_data[
|
||||
(train_data['展现日期_idx'] >= day-past_day) & (train_data['展现日期_idx'] < day)]
|
||||
train_data_temp = train_data_temp.groupby(feat)[target].agg(
|
||||
[(name_mean, 'mean'), (name_std, 'std'), (name_sum, 'sum')]
|
||||
).reset_index()
|
||||
train_data_temp['展现日期_idx'] = day
|
||||
res_arr.append(train_data_temp)
|
||||
stat_all_data = pd.concat(res_arr)
|
||||
all_data = all_data.merge(stat_all_data, how='left', on=feat + ['展现日期_idx'])
|
||||
|
||||
return all_data, statis_dense_columns
|
||||
|
||||
|
||||
def main():
|
||||
raw_data_path = '../raw_data'
|
||||
new_data_path = '../new_data'
|
||||
os.makedirs(new_data_path, exist_ok=True)
|
||||
|
||||
train_data_path = os.path.join(raw_data_path, 'train_data.pkl')
|
||||
test_data_path = os.path.join(raw_data_path, 'test_data.pkl')
|
||||
|
||||
train_data = pd.read_pickle(train_data_path)
|
||||
test_data = pd.read_pickle(test_data_path)
|
||||
test_data['是否点击'] = -1
|
||||
all_data = pd.concat([train_data, test_data])
|
||||
|
||||
# 1. 合并用户特征
|
||||
user_path = os.path.join(new_data_path, 'user_info_5w.pkl')
|
||||
user_info = pd.read_pickle(user_path)
|
||||
all_data = all_data.merge(
|
||||
user_info[['user_id', '设备名称', '操作系统', '所在省', '所在市', '年龄', '性别']],
|
||||
how='left', on='user_id'
|
||||
)
|
||||
del user_info
|
||||
gc.collect()
|
||||
|
||||
# 2. 合并文档特征
|
||||
doc_path = os.path.join(new_data_path, 'doc_info.pkl')
|
||||
doc_info = pd.read_pickle(doc_path)
|
||||
all_data = all_data.merge(
|
||||
doc_info[['item_id', '一级分类', '二级分类', '关键词', '图片数量', '发文时间', '发文日期']],
|
||||
how='left', on='item_id'
|
||||
)
|
||||
del doc_info
|
||||
gc.collect()
|
||||
|
||||
# 3. 获取统计特征
|
||||
all_data, statis_dense_columns = get_statistical_features(all_data)
|
||||
|
||||
# 4. 连续特征处理
|
||||
base_dense_columns = ['刷新次数', '图片数量']
|
||||
dense_columns = base_dense_columns + statis_dense_columns
|
||||
|
||||
all_data.fillna(value={feat: 0 for feat in dense_columns}, inplace=True)
|
||||
# sc = StandardScaler()
|
||||
# all_data[dense_columns] = sc.fit_transform(all_data[dense_columns])
|
||||
for feat in dense_columns:
|
||||
all_data[feat] = np.log(1 + all_data[feat])
|
||||
|
||||
# 5. 离散特征处理
|
||||
sparse_columns = ['user_id', 'item_id', '网路环境', '设备名称', '操作系统', '展现位置',
|
||||
'所在省', '所在市', '年龄', '性别', '一级分类', '二级分类', '关键词']
|
||||
for feat in sparse_columns:
|
||||
lb = LabelEncoder()
|
||||
all_data[feat] = lb.fit_transform(all_data[feat].astype(str))
|
||||
|
||||
all_data = reduce_mem(all_data)
|
||||
feature_info = {'dense_features': dense_columns,
|
||||
'sparse_features': sparse_columns}
|
||||
file = [all_data, feature_info]
|
||||
file_save_path = os.path.join(new_data_path, 'data.pkl')
|
||||
with open(file_save_path, 'wb') as f:
|
||||
pickle.dump(file, f)
|
||||
f.close()
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
@@ -0,0 +1,46 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding:utf-8 -*-
|
||||
# @File : train&test_data_split.py
|
||||
# @Author: xLyons
|
||||
# @IDE : PyCharm
|
||||
# @Time : 2022/2/16
|
||||
|
||||
|
||||
import os
|
||||
import pandas as pd
|
||||
|
||||
|
||||
def main():
|
||||
raw_data_path = '../raw_data'
|
||||
new_data_path = '../new_data'
|
||||
|
||||
# 1. 数据读取
|
||||
all_data_path = os.path.join(raw_data_path, 'train_data_5w.csv')
|
||||
all_data = pd.read_csv(all_data_path, sep='\t', index_col=0) # .sample(n=100000)
|
||||
all_data.columns = ['user_id', 'item_id', '展现时间', '网路环境', '刷新次数', '展现位置', '是否点击', '消费时长(秒)']
|
||||
print(f'样本总数为:{all_data.shape[0]}')
|
||||
|
||||
# 2. 数据处理
|
||||
all_data.loc[all_data['消费时长(秒)'] < 0, '消费时长(秒)'] = 0
|
||||
all_data['展现时间'] = pd.to_datetime(
|
||||
all_data.loc[:, '展现时间'], utc=True, unit='ms').dt.tz_convert('Asia/Shanghai')
|
||||
all_data['展现日期'] = all_data['展现时间'].dt.date
|
||||
all_data['index'] = range(all_data.shape[0])
|
||||
|
||||
dates = all_data['展现日期'].unique()
|
||||
dates.sort()
|
||||
# 3. 训练、测试数据集划分
|
||||
train_data = all_data[all_data['展现日期'] != dates[-1]]
|
||||
test_data = all_data[all_data['展现日期'] == dates[-1]]
|
||||
test_label = test_data[['index', '是否点击']]
|
||||
|
||||
# 4. 测试集处理
|
||||
test_data = test_data.drop(columns=['消费时长(秒)', '展现位置', '是否点击'])
|
||||
|
||||
train_data.to_pickle(os.path.join(raw_data_path, 'train_data.pkl'))
|
||||
test_data.to_pickle(os.path.join(raw_data_path, 'test_data.pkl'))
|
||||
test_label.to_pickle(os.path.join(new_data_path, 'test_label.pkl'))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
@@ -0,0 +1,103 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding:utf-8 -*-
|
||||
# @File : user&doc_data_process.py
|
||||
# @Author: xLyons
|
||||
# @IDE : PyCharm
|
||||
# @Time : 2022/2/9
|
||||
|
||||
import os
|
||||
import swifter
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
|
||||
from tqdm.auto import tqdm
|
||||
|
||||
|
||||
def prob2val(feat_info):
|
||||
# 判断是否为空
|
||||
if feat_info == feat_info:
|
||||
prob_list = [values.split(':') for values in feat_info.split(',')]
|
||||
prob_list = sorted(prob_list, key=lambda x: float(x[1]))
|
||||
return prob_list[-1][0]
|
||||
else:
|
||||
return np.NaN
|
||||
|
||||
|
||||
def get_second_title(x):
|
||||
if x['二级分类'] == x['二级分类']:
|
||||
second_titles = x['二级分类'].split('/')
|
||||
for title in second_titles:
|
||||
# 跳过异常数据
|
||||
if title == 'A_0_24:0.447656,A_25_29:0.243809,A_30_39:0.076268,A_40+:0.232267':
|
||||
continue
|
||||
# 优先返回不等于一级分类的二级分类
|
||||
if title != x['一级分类']:
|
||||
return title
|
||||
|
||||
return x['一级分类']
|
||||
|
||||
|
||||
def get_key_word(feat_info):
|
||||
if feat_info == feat_info and isinstance(feat_info, str):
|
||||
key_word_list = [values.split(':') for values in feat_info.replace('^', '').split(',')]
|
||||
|
||||
new_list = []
|
||||
last_elem = ''
|
||||
for idx, values in enumerate(key_word_list):
|
||||
if len(values) == 1:
|
||||
last_elem = values[0] if last_elem == '' else ','.join([last_elem, values[0]])
|
||||
continue
|
||||
if len(values) > 2:
|
||||
# 将类似于‘你好,李焕英’这种关键词重新进行拼接
|
||||
# 这类关键词由于存在逗号,在获取key_word_list时被误分开了
|
||||
values[0] = ':'.join(values[:-1])
|
||||
|
||||
values[0] = values[0] if last_elem == '' else ','.join([last_elem, values[0]])
|
||||
new_list.append(values)
|
||||
last_elem = ''
|
||||
|
||||
return new_list[-1][0]
|
||||
else:
|
||||
return np.NaN
|
||||
|
||||
|
||||
def main():
|
||||
raw_data_path = '../raw_data'
|
||||
new_data_path = '../new_data'
|
||||
os.makedirs(new_data_path, exist_ok=True)
|
||||
|
||||
# 1. 处理用户文件
|
||||
user_path = os.path.join(raw_data_path, 'user_info_5w.csv')
|
||||
user_info = pd.read_csv(user_path, sep='\t', index_col=0)
|
||||
user_info.columns = ['user_id', '设备名称', '操作系统', '所在省', '所在市', '年龄', '性别']
|
||||
|
||||
user_info['年龄'] = [prob2val(age_info) for age_info in tqdm(user_info['年龄'])]
|
||||
user_info['性别'] = [prob2val(sex_info) for sex_info in tqdm(user_info['性别'])]
|
||||
|
||||
user_info.to_pickle(os.path.join(new_data_path, 'user_info_5w.pkl'))
|
||||
|
||||
# 2. 处理文档文件
|
||||
doc_path = os.path.join(raw_data_path, 'doc_info.txt')
|
||||
doc_info = pd.read_table(doc_path, sep='\t', low_memory=False, header=None)
|
||||
doc_info.columns = ['item_id', '标题', '发文时间', '图片数量', '一级分类', '二级分类', '关键词']
|
||||
|
||||
# 处理异常的发文时间数据
|
||||
condition_row = (doc_info['发文时间'].isnull()) | (doc_info['发文时间'] == 'Android')
|
||||
time_fill_value = doc_info.loc[~condition_row, '发文时间'].swifter.apply(lambda x: int(x[:10])).astype('int').min()
|
||||
doc_info.loc[condition_row, '发文时间'] = str(time_fill_value)
|
||||
|
||||
doc_info['发文时间'] = pd.to_datetime(
|
||||
doc_info.loc[:, '发文时间'], utc=True, unit='ms').dt.tz_convert('Asia/Shanghai')
|
||||
doc_info['发文日期'] = doc_info['发文时间'].dt.date
|
||||
|
||||
doc_info['图片数量'] = doc_info.loc[:, '图片数量'].swifter.apply(
|
||||
lambda x: 0 if (x in ['上海', '云南', '山东'] or x != x) else int(x))
|
||||
|
||||
doc_info['二级分类'] = doc_info.loc[:, ['一级分类', '二级分类']].swifter.apply(get_second_title, axis=1)
|
||||
doc_info['关键词'] = [get_key_word(words) for words in tqdm(doc_info['关键词'])]
|
||||
|
||||
doc_info.to_pickle(os.path.join(new_data_path, 'doc_info.pkl'))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
@@ -0,0 +1,13 @@
|
||||
原始数据集共包含3个,实验时存放在目录`rank/examples/dataset/raw_data/`下。
|
||||
|
||||
+ **user_info_5w.csv**
|
||||
+ 该文件共包含了5万条用户的个人数据;
|
||||
+ 特征分别包括了:['user_id', 'device', 'os', 'province', 'city', 'age', 'gender'];
|
||||
+ 各特征的含义为:['用户id', '设备名称', '操作系统', '所在省', '所在市', '年龄', '性别'];
|
||||
+ **doc_info.txt**
|
||||
+ 该文件包含了所有新闻的特征数据;
|
||||
+ 各特征的含义为:['文档id', '标题', '发文时间', '图片数量', '一级分类', '二级分类', '关键词'];
|
||||
|
||||
+ **train_data_5w.csv**
|
||||
+ 该文件为用户点击数据,包含了5万个用户在过去13天的点击数据;
|
||||
+ 各特征的含义为:['用户id', '文档id', '展现时间', '网路环境', '刷新次数', '展现位置', '是否点击', '消费时长(秒)'];
|
||||
@@ -0,0 +1,29 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding:utf-8 -*-
|
||||
# @File : deepfm_news.py
|
||||
# @Author: xLyons
|
||||
# @IDE : PyCharm
|
||||
# @Time : 2022/1/27
|
||||
|
||||
import argparse
|
||||
|
||||
from run_train import run_deepfm
|
||||
from utils.set_parament import get_args
|
||||
from dataset.data_process.create_ctr_data import create_ctr_data
|
||||
|
||||
|
||||
parser = argparse.ArgumentParser(description='Model Parameter')
|
||||
parser.add_argument('--yaml_path',
|
||||
default='./set_para/deepfm_news.yaml',
|
||||
required=False)
|
||||
parser.add_argument('--data_path',
|
||||
default='./dataset/new_data/',
|
||||
required=False)
|
||||
parse_args = parser.parse_args()
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
args = get_args(parse_args.yaml_path)
|
||||
train_data, test_data, feature_info = create_ctr_data(parse_args.data_path, args)
|
||||
|
||||
run_deepfm.run(train_data, test_data, feature_info, args)
|
||||
@@ -0,0 +1,29 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding:utf-8 -*-
|
||||
# @File : deepfm_ppnet_news.py
|
||||
# @Author: xLyons
|
||||
# @IDE : PyCharm
|
||||
# @Time : 2022/2/8
|
||||
|
||||
import argparse
|
||||
|
||||
from run_train import run_deepfm_ppnet
|
||||
from utils.set_parament import get_args
|
||||
from dataset.data_process.create_ctr_data import create_ctr_data
|
||||
|
||||
|
||||
parser = argparse.ArgumentParser(description='Model Parameter')
|
||||
parser.add_argument('--yaml_path',
|
||||
default='./set_para/deepfm_ppnet_news.yaml',
|
||||
required=False)
|
||||
parser.add_argument('--data_path',
|
||||
default='./dataset/new_data/',
|
||||
required=False)
|
||||
parse_args = parser.parse_args()
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
args = get_args(parse_args.yaml_path)
|
||||
train_data, test_data, feature_info = create_ctr_data(parse_args.data_path, args)
|
||||
|
||||
run_deepfm_ppnet.run(train_data, test_data, feature_info, args)
|
||||
@@ -0,0 +1,14 @@
|
||||
# data para
|
||||
seed: 48
|
||||
# model para
|
||||
embed_dim: 32
|
||||
drop_rate: 0.5
|
||||
use_bn: Ture
|
||||
hidden_units: [64, 128, 64]
|
||||
# compile para
|
||||
learning_rate: 0.001
|
||||
epochs: 1
|
||||
batch_size: 2048
|
||||
val_splite: 0.1
|
||||
patience: 5
|
||||
restore_best_weights: True
|
||||
@@ -0,0 +1,16 @@
|
||||
# data para
|
||||
seed: 48
|
||||
# model para
|
||||
embed_dim: 32
|
||||
drop_rate: 0.5
|
||||
ppnet_size: 256
|
||||
ppnet_features: ['user_id', '一级分类', '年龄']
|
||||
use_bn: Ture
|
||||
hidden_units: [64, 128, 64]
|
||||
# compile para
|
||||
learning_rate: 0.001
|
||||
epochs: 1
|
||||
batch_size: 2048
|
||||
val_splite: 0.1
|
||||
patience: 5
|
||||
restore_best_weights: True
|
||||
@@ -0,0 +1,6 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding:utf-8 -*-
|
||||
# @File : __init__.py.py
|
||||
# @Author: xLyons
|
||||
# @IDE : PyCharm
|
||||
# @Time : 2022/1/27
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,24 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding:utf-8 -*-
|
||||
# @File : activation.py
|
||||
# @Author: xLyons
|
||||
# @IDE : PyCharm
|
||||
# @Time : 2022/1/27
|
||||
|
||||
import tensorflow as tf
|
||||
from tensorflow.keras.initializers import Zeros
|
||||
from tensorflow.keras.layers import Layer
|
||||
|
||||
unicode = str
|
||||
|
||||
|
||||
def activation_layer(activation):
|
||||
if isinstance(activation, (str, unicode)):
|
||||
act_layer = tf.keras.layers.Activation(activation)
|
||||
elif issubclass(activation, Layer):
|
||||
act_layer = activation()
|
||||
else:
|
||||
raise ValueError(
|
||||
"Invalid activation,found %s.You should use a str or a Activation Layer Class." % (activation))
|
||||
return act_layer
|
||||
|
||||
238
codes/news_recsys/news_rec_server/recprocess/rank/layers/core.py
Normal file
238
codes/news_recsys/news_rec_server/recprocess/rank/layers/core.py
Normal file
@@ -0,0 +1,238 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
# @File : core.py
|
||||
# @Author: xLyons
|
||||
# @IDE :PyCharm
|
||||
# @Time : 2022/1/27
|
||||
|
||||
import tensorflow as tf
|
||||
from tensorflow.keras.layers import Layer, Dense, Embedding, BatchNormalization, Dropout
|
||||
from tensorflow.keras.initializers import glorot_normal, Zeros, glorot_uniform
|
||||
from tensorflow.keras.regularizers import l2
|
||||
|
||||
from layers.activation import activation_layer
|
||||
|
||||
|
||||
class DNN(Layer):
|
||||
def __init__(self, hidden_units, activation='relu', l2_reg=.0, dropout_rate=.0, use_bn=False,
|
||||
output_activation=None, seed=48, **kwargs):
|
||||
self.hidden_units = hidden_units
|
||||
self.activation = activation
|
||||
self.l2_reg = l2_reg
|
||||
self.dropout_rate = dropout_rate
|
||||
self.use_bn = use_bn
|
||||
self.output_activation = output_activation
|
||||
self.seed = seed
|
||||
|
||||
super(DNN, self).__init__(**kwargs)
|
||||
|
||||
def build(self, input_shape):
|
||||
input_size = input_shape[-1]
|
||||
|
||||
hidden_units = [int(input_size)] + list(self.hidden_units)
|
||||
self.kernels = [self.add_weight(name='kernel' + str(i),
|
||||
shape=(
|
||||
hidden_units[i], hidden_units[i + 1]),
|
||||
initializer=glorot_uniform(
|
||||
seed=self.seed),
|
||||
regularizer=l2(self.l2_reg),
|
||||
trainable=True) for i in range(len(self.hidden_units))]
|
||||
self.bias = [self.add_weight(name='bias' + str(i),
|
||||
shape=(self.hidden_units[i],),
|
||||
initializer=Zeros(),
|
||||
trainable=True) for i in range(len(self.hidden_units))]
|
||||
if self.use_bn:
|
||||
self.bn_layers = [tf.keras.layers.BatchNormalization() for _ in range(len(self.hidden_units))]
|
||||
|
||||
self.dropout_layers = [tf.keras.layers.Dropout(self.dropout_rate, seed=self.seed + i) for i in
|
||||
range(len(self.hidden_units))]
|
||||
|
||||
self.activation_layers = [activation_layer(self.activation) for _ in range(len(self.hidden_units))]
|
||||
|
||||
if self.output_activation:
|
||||
self.activation_layers[-1] = activation_layer(self.output_activation)
|
||||
|
||||
super(DNN, self).build(input_shape)
|
||||
|
||||
def call(self, inputs, training=True, **kwargs):
|
||||
deep_input = inputs
|
||||
|
||||
for i in range(len(self.hidden_units)):
|
||||
fc = tf.nn.bias_add(tf.tensordot(
|
||||
deep_input, self.kernels[i], axes=(-1, 0)), self.bias[i])
|
||||
|
||||
if self.use_bn:
|
||||
fc = self.bn_layers[i](fc, training=training)
|
||||
try:
|
||||
fc = self.activation_layers[i](fc, training=training)
|
||||
except TypeError as e:
|
||||
print("make sure the activation function use training flag properly", e)
|
||||
fc = self.activation_layers[i](fc)
|
||||
|
||||
fc = self.dropout_layers[i](fc, training=training)
|
||||
deep_input = fc
|
||||
|
||||
return deep_input
|
||||
|
||||
|
||||
class Linear(Layer):
|
||||
def __init__(self, l2_reg=.0, use_bias=False, seed=48, **kwargs):
|
||||
self.l2_reg = l2_reg
|
||||
self.use_bias = use_bias
|
||||
self.seed = seed
|
||||
|
||||
super().__init__(**kwargs)
|
||||
|
||||
def build(self, input_shape):
|
||||
self.kernel = self.add_weight(
|
||||
name='linear_kernel',
|
||||
shape=(input_shape[-1], 1),
|
||||
initializer=glorot_normal(self.seed),
|
||||
regularizer=l2(self.l2_reg),
|
||||
trainable=True,
|
||||
)
|
||||
if self.use_bias:
|
||||
self.bias = self.add_weight(
|
||||
name='linear_bais',
|
||||
shape=(1, ),
|
||||
initializer=Zeros(),
|
||||
trainable=True
|
||||
)
|
||||
|
||||
super(Linear, self).build(input_shape)
|
||||
|
||||
def call(self, inputs, **kwargs):
|
||||
linear_logits = tf.tensordot(inputs, self.kernel, axes=1)
|
||||
if self.use_bias:
|
||||
linear_logits += self.bias
|
||||
|
||||
return linear_logits
|
||||
|
||||
|
||||
class GateNN(Layer):
|
||||
def __init__(self, hidden_units, activation='relu', l2_reg=.0, dropout_rate=.0, use_bn=False,
|
||||
output_activation='sigmoid', seed=48, **kwargs):
|
||||
self.hidden_units = hidden_units
|
||||
self.activation = activation
|
||||
self.l2_reg = l2_reg
|
||||
self.dropout_rate = dropout_rate
|
||||
self.use_bn = use_bn
|
||||
self.output_activation = output_activation
|
||||
self.seed = seed
|
||||
|
||||
super(GateNN, self).__init__(**kwargs)
|
||||
|
||||
def build(self, input_shape):
|
||||
input_size = input_shape[-1]
|
||||
|
||||
hidden_units = [int(input_size)] + list(self.hidden_units)
|
||||
self.kernels = [self.add_weight(name='kernel' + str(i),
|
||||
shape=(
|
||||
hidden_units[i], hidden_units[i + 1]),
|
||||
initializer=glorot_uniform(
|
||||
seed=self.seed),
|
||||
regularizer=l2(self.l2_reg),
|
||||
trainable=True) for i in range(len(self.hidden_units))]
|
||||
self.bias = [self.add_weight(name='bias' + str(i),
|
||||
shape=(self.hidden_units[i],),
|
||||
initializer=Zeros(),
|
||||
trainable=True) for i in range(len(self.hidden_units))]
|
||||
|
||||
self.activation_layers = [activation_layer(self.activation) for _ in range(len(self.hidden_units))]
|
||||
|
||||
if self.output_activation:
|
||||
self.activation_layers[-1] = activation_layer(self.output_activation)
|
||||
|
||||
super(GateNN, self).build(input_shape)
|
||||
|
||||
def call(self, inputs, training=True, **kwargs):
|
||||
deep_input = inputs
|
||||
|
||||
for i in range(len(self.hidden_units)):
|
||||
fc = tf.nn.bias_add(tf.tensordot(
|
||||
deep_input, self.kernels[i], axes=(-1, 0)), self.bias[i])
|
||||
|
||||
try:
|
||||
fc = self.activation_layers[i](fc, training=training)
|
||||
except TypeError as e:
|
||||
print("make sure the activation function use training flag properly", e)
|
||||
fc = self.activation_layers[i](fc)
|
||||
|
||||
deep_input = fc
|
||||
|
||||
return deep_input
|
||||
|
||||
|
||||
class PPNet(Layer):
|
||||
def __init__(self, ppnet_size, hidden_units, activation='relu', l2_reg=.0, dropout_rate=.0, use_bn=False,
|
||||
output_activation=None, seed=48, **kwargs):
|
||||
self.ppnet_size = ppnet_size
|
||||
self.hidden_units = hidden_units
|
||||
self.activation = activation
|
||||
self.l2_reg = l2_reg
|
||||
self.dropout_rate = dropout_rate
|
||||
self.use_bn = use_bn
|
||||
self.output_activation = output_activation
|
||||
self.seed = seed
|
||||
|
||||
super(PPNet, self).__init__(**kwargs)
|
||||
|
||||
def build(self, input_shape):
|
||||
input_size = input_shape[0][-1]
|
||||
hidden_units = [int(input_size)] + list(self.hidden_units)
|
||||
|
||||
self.gate_nn_layers = [
|
||||
GateNN(hidden_units=[self.ppnet_size, hidden_units[i]],
|
||||
activation='relu',
|
||||
output_activation='sigmoid',
|
||||
l2_reg=self.l2_reg,
|
||||
seed=self.seed)
|
||||
for i in range(len(self.hidden_units))
|
||||
]
|
||||
self.kernels = [self.add_weight(name='kernel' + str(i),
|
||||
shape=(hidden_units[i], hidden_units[i + 1]),
|
||||
initializer=glorot_uniform(
|
||||
seed=self.seed),
|
||||
regularizer=l2(self.l2_reg),
|
||||
trainable=True) for i in range(len(self.hidden_units))]
|
||||
self.bias = [self.add_weight(name='bias' + str(i),
|
||||
shape=(self.hidden_units[i],),
|
||||
initializer=Zeros(),
|
||||
trainable=True) for i in range(len(self.hidden_units))]
|
||||
if self.use_bn:
|
||||
self.bn_layers = [tf.keras.layers.BatchNormalization() for _ in range(len(self.hidden_units))]
|
||||
|
||||
self.dropout_layers = [tf.keras.layers.Dropout(self.dropout_rate, seed=self.seed + i) for i in
|
||||
range(len(self.hidden_units))]
|
||||
|
||||
self.activation_layers = [activation_layer(self.activation) for _ in range(len(self.hidden_units))]
|
||||
|
||||
if self.output_activation:
|
||||
self.activation_layers[-1] = activation_layer(self.output_activation)
|
||||
|
||||
super(PPNet, self).build(input_shape)
|
||||
|
||||
def call(self, inputs, training=True, **kwargs):
|
||||
deep_input, ppnet_input = inputs
|
||||
|
||||
for i in range(len(self.hidden_units)):
|
||||
ppnet_scale = self.gate_nn_layers[i](ppnet_input)
|
||||
deep_input = deep_input * ppnet_scale * 2
|
||||
fc = tf.nn.bias_add(tf.tensordot(
|
||||
deep_input, self.kernels[i], axes=(-1, 0)), self.bias[i])
|
||||
|
||||
if self.use_bn:
|
||||
fc = self.bn_layers[i](fc, training=training)
|
||||
try:
|
||||
fc = self.activation_layers[i](fc, training=training)
|
||||
except TypeError as e:
|
||||
print("make sure the activation function use training flag properly", e)
|
||||
fc = self.activation_layers[i](fc)
|
||||
|
||||
fc = self.dropout_layers[i](fc, training=training)
|
||||
deep_input = fc
|
||||
|
||||
return deep_input
|
||||
|
||||
|
||||
|
||||
@@ -0,0 +1,41 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
# @File : embedding.py
|
||||
# @Author: xLyons
|
||||
# @IDE :PyCharm
|
||||
# @Time : 2021/8/27
|
||||
|
||||
from tensorflow.keras.layers import Embedding
|
||||
from tensorflow.keras.regularizers import l2
|
||||
|
||||
|
||||
def create_embed_dict(sparse_feature_columns, embed_l2_reg):
|
||||
sparse_embed_dict = {}
|
||||
for feat in sparse_feature_columns:
|
||||
feat_embed_name = feat.embed_name
|
||||
if feat_embed_name not in sparse_embed_dict.keys():
|
||||
embed_layer = Embedding(
|
||||
input_dim=feat.vocab_size,
|
||||
input_length=1,
|
||||
output_dim=feat.embed_dim,
|
||||
embeddings_initializer=feat.embed_init,
|
||||
embeddings_regularizer=l2(embed_l2_reg)
|
||||
)
|
||||
embed_layer.trainable = True
|
||||
sparse_embed_dict[feat_embed_name] = embed_layer
|
||||
|
||||
return sparse_embed_dict
|
||||
|
||||
|
||||
def embedding_lookup(sparse_embed_dict, feat_inputs, sparse_feature_columns, query_features=(), to_list=False):
|
||||
feat_embed_outputs = {}
|
||||
for feat in sparse_feature_columns:
|
||||
feat_name = feat.name
|
||||
if len(query_features) == 0 or feat_name in query_features:
|
||||
feat_input = feat_inputs[feat_name]
|
||||
feat_embed_outputs[feat_name] = sparse_embed_dict[feat.embed_name](feat_input)
|
||||
|
||||
if to_list:
|
||||
return list(feat_embed_outputs.values())
|
||||
|
||||
return feat_embed_outputs
|
||||
@@ -0,0 +1,32 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding:utf-8 -*-
|
||||
# @file : interaction.py
|
||||
# @Author: xLyons
|
||||
# @IDE : PyCharm
|
||||
# @Time : 2021/9/15
|
||||
|
||||
import tensorflow as tf
|
||||
|
||||
from tensorflow.keras.layers import Layer
|
||||
|
||||
|
||||
class FMCross(Layer):
|
||||
def __init__(self, **kwargs):
|
||||
super(FMCross, self).__init__(**kwargs)
|
||||
|
||||
def build(self, input_shape):
|
||||
if len(input_shape) != 3:
|
||||
raise ValueError("Unexpected inputs dimensions % d,\
|
||||
expect to be 3 dimensions" % (len(input_shape)))
|
||||
|
||||
super(FMCross, self).build(input_shape) # Be sure to call this somewhere!
|
||||
|
||||
def call(self, inputs, **kwargs):
|
||||
square_of_sum = tf.square(tf.reduce_sum(inputs, axis=1, keepdims=True)) # None, 1, dim
|
||||
sum_of_square = tf.reduce_sum(inputs * inputs, axis=1, keepdims=True) # None, 1, dim
|
||||
|
||||
cross_term = square_of_sum - sum_of_square
|
||||
cross_term = 0.5 * tf.reduce_sum(cross_term, axis=2, keepdims=False) # None, 1
|
||||
|
||||
return cross_term
|
||||
|
||||
@@ -0,0 +1,6 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding:utf-8 -*-
|
||||
# @File : __init__.py.py
|
||||
# @Author: xLyons
|
||||
# @IDE : PyCharm
|
||||
# @Time : 2022/1/27
|
||||
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,102 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
# @File : feature_columns.py
|
||||
# @Author: xLyons
|
||||
# @IDE :PyCharm
|
||||
# @Time : 2022/1/27
|
||||
|
||||
|
||||
import copy
|
||||
import tensorflow as tf
|
||||
|
||||
from collections import OrderedDict
|
||||
from tensorflow.keras.layers import Input, Flatten, Concatenate
|
||||
from tensorflow.keras.initializers import RandomNormal, Zeros
|
||||
from tensorflow.keras.layers.experimental.preprocessing import StringLookup
|
||||
|
||||
from layers.core import Linear
|
||||
from layers.embedding import create_embed_dict, embedding_lookup
|
||||
|
||||
|
||||
class SparseFeat(object):
|
||||
def __init__(self, name, embed_dim, vocab_size, dtype, embed_name=None, seed=48):
|
||||
self.name = name
|
||||
self.vocab_size = vocab_size
|
||||
self.embed_dim = embed_dim
|
||||
self.embed_init = RandomNormal(mean=0.0, stddev=0.01, seed=seed)
|
||||
self.dtype = dtype
|
||||
|
||||
self.embed_name = embed_name if embed_name else name
|
||||
|
||||
super(SparseFeat, self).__init__()
|
||||
|
||||
|
||||
class DenseFeat(object):
|
||||
def __init__(self, name, dimension, dtype=None):
|
||||
self.name = name
|
||||
self.dimension = dimension
|
||||
self.dtype = dtype
|
||||
|
||||
super(DenseFeat, self).__init__()
|
||||
|
||||
|
||||
def build_feature_inputs(feature_columns):
|
||||
feat_inputs = OrderedDict()
|
||||
for feat in feature_columns:
|
||||
if isinstance(feat, SparseFeat):
|
||||
sparse_inputs = Input(shape=(1, ),
|
||||
name=feat.name,
|
||||
dtype=feat.dtype)
|
||||
feat_inputs[feat.name] = sparse_inputs
|
||||
elif isinstance(feat, DenseFeat):
|
||||
dense_inputs = Input(shape=(feat.dimension, ),
|
||||
name=feat.name,
|
||||
dtype=feat.dtype)
|
||||
feat_inputs[feat.name] = dense_inputs
|
||||
else:
|
||||
raise TypeError("Invalid feature column type,got", type(feat))
|
||||
|
||||
return feat_inputs
|
||||
|
||||
|
||||
def build_feature_coding_model(all_data, sparse_features):
|
||||
feature_vocab_dict = dict()
|
||||
for feat in sparse_features:
|
||||
string_model = StringLookup(vocabulary=all_data[feat].unique(),
|
||||
mask_token=None)
|
||||
feature_vocab_dict[feat] = string_model
|
||||
|
||||
return feature_vocab_dict
|
||||
|
||||
|
||||
def get_dense_inputs(feat_inputs, feature_columns, concat_flag=True):
|
||||
dense_inputs = []
|
||||
for feat in feature_columns:
|
||||
if isinstance(feat, DenseFeat):
|
||||
dense_inputs.append(feat_inputs[feat.name])
|
||||
|
||||
if concat_flag:
|
||||
dense_inputs = tf.concat(dense_inputs, axis=-1)
|
||||
|
||||
return dense_inputs
|
||||
|
||||
|
||||
def get_linear_logit(feat_inputs, feature_columns, linear_l2_reg=.0, embed_l2_reg=1e-5, use_bias=True, seed=48,):
|
||||
linear_features = copy.deepcopy(feature_columns)
|
||||
for feat in linear_features:
|
||||
if isinstance(feat, SparseFeat):
|
||||
feat.embed_dim = 1
|
||||
feat.embed_init = Zeros()
|
||||
|
||||
sparse_feature_columns = list(
|
||||
filter(lambda x: isinstance(x, SparseFeat), linear_features)) if feature_columns else []
|
||||
sparse_embed_dict = create_embed_dict(sparse_feature_columns, embed_l2_reg)
|
||||
sparse_embed_list = embedding_lookup(sparse_embed_dict, feat_inputs, sparse_feature_columns, to_list=True)
|
||||
|
||||
dense_inputs = get_dense_inputs(feat_inputs, linear_features, concat_flag=True)
|
||||
sparse_embed_inputs = Flatten()(Concatenate(axis=-1)(sparse_embed_list))
|
||||
linear_inputs = tf.concat([dense_inputs, sparse_embed_inputs], axis=-1)
|
||||
|
||||
linear_logit = Linear(linear_l2_reg, use_bias, seed)(linear_inputs)
|
||||
|
||||
return linear_logit
|
||||
@@ -0,0 +1,6 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding:utf-8 -*-
|
||||
# @File : __init__.py.py
|
||||
# @Author: xLyons
|
||||
# @IDE : PyCharm
|
||||
# @Time : 2022/1/27
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,63 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding:utf-8 -*-
|
||||
# @File : deepfm.py
|
||||
# @Author: xLyons
|
||||
# @IDE : PyCharm
|
||||
# @Time : 2022/1/27
|
||||
|
||||
from tensorflow.keras.layers import Dense
|
||||
|
||||
from layers.core import DNN
|
||||
from layers.interaction import FMCross
|
||||
from model_tools.feature_columns import *
|
||||
|
||||
|
||||
def DeepFM(feature_columns,
|
||||
dnn_hidden_units,
|
||||
embed_l2_reg=1e-5,
|
||||
linear_l2_reg=1e-5,
|
||||
linear_use_bias=True,
|
||||
dnn_l2_reg=1e-5,
|
||||
dnn_drop_rate=.0,
|
||||
dnn_use_bn=False,
|
||||
dnn_activation='relu',
|
||||
seed=48):
|
||||
|
||||
feat_inputs = build_feature_inputs(feature_columns)
|
||||
inputs_list = list(feat_inputs.values())
|
||||
|
||||
sparse_feature_columns = list(
|
||||
filter(lambda x: isinstance(x, SparseFeat), feature_columns)) if feature_columns else []
|
||||
|
||||
sparse_embed_dict = create_embed_dict(sparse_feature_columns, embed_l2_reg)
|
||||
sparse_embed_list = embedding_lookup(sparse_embed_dict, feat_inputs, sparse_feature_columns, to_list=True)
|
||||
|
||||
dense_inputs = get_dense_inputs(feat_inputs, feature_columns, concat_flag=True)
|
||||
|
||||
linear_logit = get_linear_logit(feat_inputs=feat_inputs,
|
||||
feature_columns=feature_columns,
|
||||
linear_l2_reg=linear_l2_reg,
|
||||
embed_l2_reg=embed_l2_reg,
|
||||
use_bias=linear_use_bias,
|
||||
seed=seed)
|
||||
|
||||
fm_inputs = Concatenate(axis=1)(sparse_embed_list)
|
||||
fm_logit = FMCross()(fm_inputs)
|
||||
|
||||
sparse_embed_inputs = Flatten()(Concatenate(axis=-1)(sparse_embed_list))
|
||||
dnn_inputs = tf.concat([dense_inputs, sparse_embed_inputs], axis=-1)
|
||||
dnn_logit = DNN(hidden_units=dnn_hidden_units,
|
||||
activation=dnn_activation,
|
||||
l2_reg=dnn_l2_reg,
|
||||
dropout_rate=dnn_drop_rate,
|
||||
use_bn=dnn_use_bn
|
||||
)(dnn_inputs)
|
||||
dnn_logit = Dense(units=1,
|
||||
use_bias=False,
|
||||
kernel_initializer=tf.keras.initializers.glorot_uniform(seed=seed)
|
||||
)(dnn_logit)
|
||||
|
||||
final_outputs = tf.nn.sigmoid(linear_logit + fm_logit + dnn_logit)
|
||||
model = tf.keras.models.Model(inputs=inputs_list, outputs=final_outputs)
|
||||
|
||||
return model
|
||||
@@ -0,0 +1,77 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding:utf-8 -*-
|
||||
# @File : deepfm_ppnet.py
|
||||
# @Author: xLyons
|
||||
# @IDE : PyCharm
|
||||
# @Time : 2022/2/8
|
||||
|
||||
|
||||
from tensorflow.keras.layers import Dense
|
||||
|
||||
from layers.core import PPNet
|
||||
from layers.interaction import FMCross
|
||||
from model_tools.feature_columns import *
|
||||
|
||||
|
||||
def DeepFM_PPNet(
|
||||
feature_columns,
|
||||
ppnet_size,
|
||||
ppnet_features,
|
||||
dnn_hidden_units,
|
||||
embed_l2_reg=1e-5,
|
||||
linear_l2_reg=1e-5,
|
||||
linear_use_bias=True,
|
||||
dnn_l2_reg=1e-5,
|
||||
dnn_drop_rate=.0,
|
||||
dnn_use_bn=False,
|
||||
dnn_activation='relu',
|
||||
seed=48):
|
||||
|
||||
feat_inputs = build_feature_inputs(feature_columns)
|
||||
inputs_list = list(feat_inputs.values())
|
||||
|
||||
sparse_feature_columns = list(
|
||||
filter(lambda x: isinstance(x, SparseFeat), feature_columns)) if feature_columns else []
|
||||
|
||||
sparse_embed_dict = create_embed_dict(sparse_feature_columns, embed_l2_reg)
|
||||
sparse_embed_list = embedding_lookup(sparse_embed_dict, feat_inputs, sparse_feature_columns, to_list=True)
|
||||
|
||||
dense_inputs = get_dense_inputs(feat_inputs, feature_columns, concat_flag=True)
|
||||
|
||||
linear_logit = get_linear_logit(feat_inputs=feat_inputs,
|
||||
feature_columns=feature_columns,
|
||||
linear_l2_reg=linear_l2_reg,
|
||||
embed_l2_reg=embed_l2_reg,
|
||||
use_bias=linear_use_bias,
|
||||
seed=seed)
|
||||
|
||||
fm_inputs = Concatenate(axis=1)(sparse_embed_list)
|
||||
fm_logit = FMCross()(fm_inputs)
|
||||
|
||||
sparse_embed_inputs = Flatten()(Concatenate(axis=-1)(sparse_embed_list))
|
||||
dnn_inputs = tf.concat([dense_inputs, sparse_embed_inputs], axis=-1)
|
||||
|
||||
ppnet_feature_columns = list(
|
||||
filter(lambda x: x.name in ppnet_features, feature_columns)) if feature_columns else []
|
||||
ppnet_embed_list = embedding_lookup(sparse_embed_dict, feat_inputs, ppnet_feature_columns, to_list=True)
|
||||
ppnet_inputs = Flatten()(Concatenate(axis=-1)(ppnet_embed_list))
|
||||
# stop gradient propagation
|
||||
ppnet_inputs = tf.stop_gradient(ppnet_inputs)
|
||||
|
||||
dnn_logit = PPNet(
|
||||
ppnet_size=ppnet_size,
|
||||
hidden_units=dnn_hidden_units,
|
||||
activation=dnn_activation,
|
||||
l2_reg=dnn_l2_reg,
|
||||
dropout_rate=dnn_drop_rate,
|
||||
use_bn=dnn_use_bn
|
||||
)([dnn_inputs, ppnet_inputs])
|
||||
dnn_logit = Dense(units=1,
|
||||
use_bias=False,
|
||||
kernel_initializer=tf.keras.initializers.glorot_uniform(seed=seed)
|
||||
)(dnn_logit)
|
||||
|
||||
final_outputs = tf.nn.sigmoid(linear_logit + fm_logit + dnn_logit)
|
||||
model = tf.keras.models.Model(inputs=inputs_list, outputs=final_outputs)
|
||||
|
||||
return model
|
||||
273
codes/news_recsys/news_rec_server/recprocess/rank/readme.md
Normal file
273
codes/news_recsys/news_rec_server/recprocess/rank/readme.md
Normal file
@@ -0,0 +1,273 @@
|
||||
# 1. 数据集介绍
|
||||
|
||||
原始数据集共包含3个,实验时存放在目录`rank/examples/dataset/raw_data/`下。
|
||||
|
||||
+ **user_info_5w.csv**
|
||||
+ 该文件共包含了5万条用户的个人数据;
|
||||
+ 特征分别包括了:['user_id', 'device', 'os', 'province', 'city', 'age', 'gender'];
|
||||
+ 各特征的含义为:['用户id', '设备名称', '操作系统', '所在省', '所在市', '年龄', '性别'];
|
||||
+ **doc_info.txt**
|
||||
+ 该文件包含了所有新闻的特征数据;
|
||||
+ 各特征的含义为:['文档id', '标题', '发文时间', '图片数量', '一级分类', '二级分类', '关键词'];
|
||||
|
||||
+ **train_data_5w.csv**
|
||||
+ 该文件为用户点击数据,包含了5万个用户在过去13天的点击数据;
|
||||
+ 各特征的含义为:['用户id', '文档id', '展现时间', '网路环境', '刷新次数', '展现位置', '是否点击', '消费时长(秒)'];
|
||||
|
||||
|
||||
|
||||
# 2. 数据处理
|
||||
|
||||
数据处理的文件存放在`rank/examples/dataset/data_process/`下。
|
||||
|
||||
## 2.1 训练集和测试集的划分
|
||||
|
||||
训练集和测试集的划分程序为:**train&test_data_split.py**
|
||||
|
||||
+ 训练集:将所有用户在前12天的点击行为划为训练集;
|
||||
+ 测试集:
|
||||
+ 将所有用户在第13天的点击行为化为测试集;
|
||||
+ 测试集丢弃特征:消费时长(秒),展现位置,是否点击;
|
||||
|
||||
+ 测试标签:
|
||||
+ 将测试集的真实标签单独进行存储;
|
||||
|
||||
其他特征处理:
|
||||
|
||||
+ 选中消费时长小于0的样本,并将其消费时长设置为0;
|
||||
+ 对所有样本的展现时间进行了格式处理,并新增了特征展现日期;
|
||||
+ 新增特征index,目的是为了后续对测试集进行评估;
|
||||
|
||||
## 2.2 特征处理
|
||||
|
||||
### 2.2.1 用户和文档特征
|
||||
|
||||
用户数据和文档数据的处理程序为:**user&doc_data_process.py**
|
||||
|
||||
**用户数据处理**
|
||||
|
||||
+ 性别特征:原始的用户性别数据为用户对应不同性别的概率,这里直接将概率最高的性别作为用户的实际性别;
|
||||
|
||||
+ 年龄数据:原始的用户年龄数据为用户对应不同年龄段的概率,这里将概率最高的年龄段作为用户所处的年龄段;
|
||||
|
||||
**文档数据处理**
|
||||
|
||||
+ 发文时间:对于部分发文时间异常或者为空的数据,使用已有文档中最早的发文时间进行填充;
|
||||
+ 发文日期:将文档的发文时间,提取出对应的发文日期(年-月-日);
|
||||
+ 图片数量:对部分异常的脏数据,使用 $0$ 进行填充处理;
|
||||
+ 二级分类:对于存在多个二级分类的文档,从其选取一个作为其二级分类。优先选择不等于一级分类的二级分类,对于二级分类为空的文档使用一级分类进行填充;
|
||||
+ 关键词:
|
||||
+ 每篇文档均存在多个关键词,每个关键词也会对应一个权重,这里选取权重最高的关键词作为文章的唯一关键词;
|
||||
+ 文档中不同的关键词及权重是采用逗号进行隔开的,但部分关键词本来就包含逗号(如==你好,李焕英==),故相关函数还对此进行了特殊处理;
|
||||
|
||||
### 2.2.2 统计特征
|
||||
|
||||
统计特征的生成的程序为:**news_data_process.py**
|
||||
|
||||
+ 从文档发文到展示的时间差:对于每一个样本,统计对应文档从发文到展示的日期差;
|
||||
+ 用户特征统计:
|
||||
|
||||
+ 统计每个用户过去几天,所展现的文档总数;
|
||||
+ 统计每个用户过去几天,在不同类别文档(一级分类)上的展现总数;
|
||||
+ 统计每个用户过去几天,在不同类别文档(二级分类)上的展现总数;
|
||||
+ 统计每个用户过去几天,整体的点击率;
|
||||
+ 统计每个用户过去几天,对不同类别文档(一级分类)上的点击率;
|
||||
+ 统计每个用户过去几天,对不同类别文档(二级分类)上的点击率;
|
||||
+ 统计每个用户过去几天,消费时长的总和;
|
||||
+ 统计每个用户过去几天,在不同类别文档(一级分类)上的总消费时长;
|
||||
+ 统计每个用户过去几天,在不同类别文档(二级分类)上的总消费时长;
|
||||
+ 统计每个用户过去几天,在不同类别文档(一级分类)上的平均消费时长;
|
||||
+ 统计每个用户过去几天,在不同类别文档(二级分类)上的平均消费时长;
|
||||
+ 统计每个用户过去几天,在不同类别文档(一级分类)上的消费时长的方差;
|
||||
+ 统计每个用户过去几天,在不同类别文档(二级分类)上的消费时长的方差;
|
||||
+ 文档特征统计:
|
||||
|
||||
+ 统计每篇文档在过去几天,被展示的总次数;
|
||||
+ 统计各类别(一级分类)文档在过去几天,被展示的总次数;
|
||||
+ 统计各类别(二级分类)文档在过去几天,被展示的总次数;
|
||||
+ 统计每篇文档在过去几天,平均的被点击率;
|
||||
+ 统计各类别(一级分类)文档在过去几天,平均被点击率;
|
||||
+ 统计各类别(二级分类)文档在过去几天,平均被点击率;
|
||||
+ 统计每篇文档在过去几天,总的被消费时长;
|
||||
+ 统计每篇文档在过去几天,平均的被消费时长;
|
||||
+ 统计每篇文档在过去几天,被消费时长的方差;
|
||||
+ 统计各类别(一级分类)文档在过去几天,总的被消费时长;
|
||||
+ 统计各类别(一级分类)文档在过去几天,平均的被消费时长;
|
||||
+ 统计各类别(一级分类)文档在过去几天,被消费时长的方差;
|
||||
+ 统计各类别(二级分类)文档在过去几天,总的被消费时长;
|
||||
+ 统计各类别(二级分类)文档在过去几天,平均的被消费时长;
|
||||
+ 统计各类别(二级分类)文档在过去几天,被消费时长的方差;
|
||||
|
||||
|
||||
### 2.2.3 特征归一化和编码
|
||||
|
||||
+ 连续型特征:
|
||||
+ 连续型特征包含的主要是统计特征,这里对于空值统一使用 $0$ 进行填充;
|
||||
+ 之后,对所有的连续型特征进行对数归一化, 即取 $log$ 对数;
|
||||
|
||||
+ 类别型特征:
|
||||
|
||||
+ 类别型特征这里主要是通过 $LabelEncoder$ 的方式进行编码,以便后续模型处理为相应的 $Embedding$;
|
||||
|
||||
|
||||
|
||||
# 3. 排序模型
|
||||
|
||||
排序模型的执行程序存放在`rank/examples/`下,分别为`deepfm_news.py`和`deepfm_ppnet_news.py`。
|
||||
|
||||
## 3.1 DeepFM
|
||||
|
||||
DeepFM是2017年由华为与哈工大提出的排序模型,,模型主要包含两部分:FM部分+Deep部分。
|
||||
|
||||
+ FM部分:对不同特征域的Embedding进行两两交叉,以加强模型在浅层网络中的特征组合能力。
|
||||
+ Deep部分:多层感知机网络模型。通过对特征各个维度进行充分的特征交叉组合,来学习到更多非线性以及组合特征的信息。
|
||||
|
||||
论文链接:[[DeepFM: A Factorization-Machine based Neural Network for CTR Prediction (arxiv.org)](https://arxiv.org/abs/1703.04247)
|
||||
|
||||
**实验结果**
|
||||
|
||||
1. 参数设置
|
||||
|
||||
```yaml
|
||||
# data para
|
||||
seed: 48
|
||||
# model para
|
||||
embed_dim: 32
|
||||
drop_rate: 0.5
|
||||
use_bn: Ture
|
||||
hidden_units: [64, 128, 64]
|
||||
# compile para
|
||||
learning_rate: 0.001
|
||||
epochs: 20
|
||||
batch_size: 2048
|
||||
val_splite: 0.1
|
||||
patience: 5
|
||||
restore_best_weights: True
|
||||
```
|
||||
|
||||
2. 运行结果
|
||||
|
||||
```bash
|
||||
Epoch 1/20
|
||||
2653/2653 [==============================] - 47s 17ms/step - loss: 0.3921 - auc: 0.7287 - val_loss: 0.3628 - val_auc: 0.7588
|
||||
Epoch 2/20
|
||||
2653/2653 [==============================] - 44s 17ms/step - loss: 0.3619 - auc: 0.7616 - val_loss: 0.3581 - val_auc: 0.7647
|
||||
Epoch 3/20
|
||||
2653/2653 [==============================] - 44s 17ms/step - loss: 0.3569 - auc: 0.7705 - val_loss: 0.3561 - val_auc: 0.7682
|
||||
Epoch 4/20
|
||||
2653/2653 [==============================] - 47s 18ms/step - loss: 0.3548 - auc: 0.7754 - val_loss: 0.3557 - val_auc: 0.7699
|
||||
Epoch 5/20
|
||||
2653/2653 [==============================] - 47s 18ms/step - loss: 0.3540 - auc: 0.7777 - val_loss: 0.3560 - val_auc: 0.7702
|
||||
Epoch 6/20
|
||||
2653/2653 [==============================] - 46s 18ms/step - loss: 0.3536 - auc: 0.7788 - val_loss: 0.3557 - val_auc: 0.7708
|
||||
Epoch 7/20
|
||||
2653/2653 [==============================] - 45s 17ms/step - loss: 0.3533 - auc: 0.7797 - val_loss: 0.3556 - val_auc: 0.7714
|
||||
Epoch 8/20
|
||||
2653/2653 [==============================] - 45s 17ms/step - loss: 0.3532 - auc: 0.7802 - val_loss: 0.3558 - val_auc: 0.7712
|
||||
Epoch 9/20
|
||||
2653/2653 [==============================] - 46s 17ms/step - loss: 0.3530 - auc: 0.7806 - val_loss: 0.3560 - val_auc: 0.7713
|
||||
Epoch 10/20
|
||||
2653/2653 [==============================] - 46s 17ms/step - loss: 0.3530 - auc: 0.7808 - val_loss: 0.3560 - val_auc: 0.7711
|
||||
Epoch 11/20
|
||||
2653/2653 [==============================] - 45s 17ms/step - loss: 0.3529 - auc: 0.7811 - val_loss: 0.3560 - val_auc: 0.7715
|
||||
Epoch 12/20
|
||||
2653/2653 [==============================] - 46s 17ms/step - loss: 0.3528 - auc: 0.7813 - val_loss: 0.3557 - val_auc: 0.7718
|
||||
251/251 [==============================] - 3s 11ms/step - loss: 0.3719 - auc: 0.7508
|
||||
test AUC: 0.750784
|
||||
```
|
||||
|
||||
## 3.2 DeepFM+PPNet
|
||||
|
||||
将DeepFM模型中,DNN 模块替换为PPNet模型:
|
||||
|
||||
+ 在语音识别领域中,2014 年和 2016 年提出的 LHUC 算法(learning hidden unit contributions)核心思想是做说话人自适应(speaker adaptation),其中一个关键突破是在 DNN 网络中,为每个说话人学习一个特定的隐式单位贡献(hidden unit contributions),来提升不同说话人的语音识别效果。
|
||||
+ 借鉴 LHUC 的思想,快手推荐团队在精排模型上展开了尝试。经过多次迭代优化,推荐团队设计出一种 gating 机制,可以增加 DNN 网络参数个性化并能够让模型快速收敛。快手把这种模型叫做 **PPNet(Parameter Personalized Net)**。
|
||||
|
||||
参考链接:[1.9万亿参数量,快手落地业界首个万亿参数推荐精排模型](https://mp.weixin.qq.com/s?__biz=MzA3MzI4MjgzMw==&idx=4&mid=2650808254&scene=21&sn=6c295c8306b7339858f8ecfadfc9d698#wechat_redirect)
|
||||
|
||||
**实验结果:**
|
||||
|
||||
1. 参数设置
|
||||
|
||||
```yaml
|
||||
# data para
|
||||
seed: 48
|
||||
# model para
|
||||
embed_dim: 32
|
||||
drop_rate: 0.5
|
||||
ppnet_size: 256
|
||||
ppnet_features: ['user_id', '一级分类', '年龄']
|
||||
use_bn: Ture
|
||||
hidden_units: [64, 128, 64]
|
||||
# compile para
|
||||
learning_rate: 0.001
|
||||
epochs: 20
|
||||
batch_size: 2048
|
||||
val_splite: 0.1
|
||||
patience: 5
|
||||
restore_best_weights: True
|
||||
```
|
||||
|
||||
2. 运行结果
|
||||
|
||||
```bash
|
||||
Epoch 1/20
|
||||
2653/2653 [==============================] - 56s 20ms/step - loss: 0.3929 - auc: 0.7303 - val_loss: 0.3648 - val_auc: 0.7568
|
||||
Epoch 2/20
|
||||
2653/2653 [==============================] - 53s 20ms/step - loss: 0.3620 - auc: 0.7622 - val_loss: 0.3591 - val_auc: 0.7651
|
||||
Epoch 3/20
|
||||
2653/2653 [==============================] - 55s 21ms/step - loss: 0.3578 - auc: 0.7706 - val_loss: 0.3580 - val_auc: 0.7690
|
||||
Epoch 4/20
|
||||
2653/2653 [==============================] - 53s 20ms/step - loss: 0.3560 - auc: 0.7755 - val_loss: 0.3587 - val_auc: 0.7701
|
||||
Epoch 5/20
|
||||
2653/2653 [==============================] - 54s 20ms/step - loss: 0.3551 - auc: 0.7787 - val_loss: 0.3580 - val_auc: 0.7706
|
||||
Epoch 6/20
|
||||
2653/2653 [==============================] - 55s 21ms/step - loss: 0.3545 - auc: 0.7809 - val_loss: 0.3587 - val_auc: 0.7718
|
||||
Epoch 7/20
|
||||
2653/2653 [==============================] - 54s 20ms/step - loss: 0.3541 - auc: 0.7829 - val_loss: 0.3586 - val_auc: 0.7720
|
||||
Epoch 8/20
|
||||
2653/2653 [==============================] - 53s 20ms/step - loss: 0.3538 - auc: 0.7842 - val_loss: 0.3587 - val_auc: 0.7721
|
||||
251/251 [==============================] - 4s 13ms/step - loss: 0.3686 - auc: 0.7543
|
||||
test AUC: 0.754304
|
||||
```
|
||||
|
||||
|
||||
|
||||
# 4. 程序执行
|
||||
|
||||
```bash
|
||||
# 数据预处理
|
||||
1. user&doc_data_process.py
|
||||
2. train&test_data_split.py
|
||||
3. news_data_process.py
|
||||
|
||||
# 排序模型
|
||||
4. deepfm_news.py 或 deepfm_ppnet_news.py
|
||||
```
|
||||
|
||||
|
||||
|
||||
# Requirements
|
||||
|
||||
- Tensorflow2.5 (GPU)
|
||||
- Numpy
|
||||
- Pandas
|
||||
- Swifter
|
||||
- Sklearn
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,39 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding:utf-8 -*-
|
||||
# @File : run_deepfm.py
|
||||
# @Author: xLyons
|
||||
# @IDE : PyCharm
|
||||
# @Time : 2022/1/27
|
||||
|
||||
|
||||
from tensorflow.keras.callbacks import EarlyStopping
|
||||
from tensorflow.keras.optimizers import Adam
|
||||
from tensorflow.keras.metrics import AUC
|
||||
|
||||
from models.deepfm import DeepFM
|
||||
|
||||
|
||||
def run(train_data, test_data, feature_columns, args):
|
||||
# 1. 建模
|
||||
model = DeepFM(feature_columns=feature_columns,
|
||||
dnn_hidden_units=args.hidden_units,
|
||||
dnn_drop_rate=args.drop_rate,
|
||||
dnn_use_bn=args.use_bn)
|
||||
# 2. 编译
|
||||
model.compile(optimizer=Adam(learning_rate=args.learning_rate),
|
||||
loss="binary_crossentropy",
|
||||
metrics=[AUC()])
|
||||
model.summary()
|
||||
# 3. 训练
|
||||
model.fit(train_data[0],
|
||||
train_data[1],
|
||||
batch_size=args.batch_size,
|
||||
epochs=args.epochs,
|
||||
callbacks=[EarlyStopping(monitor='val_loss',
|
||||
patience=args.patience,
|
||||
mode='min',
|
||||
restore_best_weights=args.restore_best_weights)],
|
||||
validation_split=args.val_splite,
|
||||
)
|
||||
# 4. 测试
|
||||
print('test AUC: %f' % model.evaluate(test_data[0], test_data[1], batch_size=args.batch_size)[1])
|
||||
@@ -0,0 +1,42 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding:utf-8 -*-
|
||||
# @File : run_deepfm_ppnet.py
|
||||
# @Author: xLyons
|
||||
# @IDE : PyCharm
|
||||
# @Time : 2022/2/8
|
||||
|
||||
|
||||
from tensorflow.keras.callbacks import EarlyStopping
|
||||
from tensorflow.keras.optimizers import Adam
|
||||
from tensorflow.keras.metrics import AUC
|
||||
|
||||
from models.deepfm_ppnet import DeepFM_PPNet
|
||||
|
||||
|
||||
def run(train_data, test_data, feature_columns, args):
|
||||
# 1. 建模
|
||||
model = DeepFM_PPNet(
|
||||
feature_columns=feature_columns,
|
||||
ppnet_size=args.ppnet_size,
|
||||
ppnet_features=args.ppnet_features,
|
||||
dnn_hidden_units=args.hidden_units,
|
||||
dnn_drop_rate=args.drop_rate,
|
||||
dnn_use_bn=args.use_bn)
|
||||
# 2. 编译
|
||||
model.compile(optimizer=Adam(learning_rate=args.learning_rate),
|
||||
loss="binary_crossentropy",
|
||||
metrics=[AUC()])
|
||||
model.summary()
|
||||
# 3. 训练
|
||||
model.fit(train_data[0],
|
||||
train_data[1],
|
||||
batch_size=args.batch_size,
|
||||
epochs=args.epochs,
|
||||
callbacks=[EarlyStopping(monitor='val_loss',
|
||||
patience=args.patience,
|
||||
mode='min',
|
||||
restore_best_weights=args.restore_best_weights)],
|
||||
validation_split=args.val_splite,
|
||||
)
|
||||
# 4. 测试
|
||||
print('test AUC: %f' % model.evaluate(test_data[0], test_data[1], batch_size=args.batch_size)[1])
|
||||
Binary file not shown.
@@ -0,0 +1,44 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding:utf-8 -*-
|
||||
# @File : data_compression.py
|
||||
# @Author: xLyons
|
||||
# @IDE : PyCharm
|
||||
# @Time : 2022/2/7
|
||||
|
||||
|
||||
import gc
|
||||
import numpy as np
|
||||
|
||||
from tqdm.auto import tqdm
|
||||
|
||||
|
||||
def reduce_mem(df):
|
||||
start_mem = df.memory_usage().sum() / 1024 ** 2
|
||||
print(f'开始进行内存压缩...')
|
||||
for col in tqdm(df.columns):
|
||||
col_type = df[col].dtypes
|
||||
if col_type != object:
|
||||
c_min = df[col].min()
|
||||
c_max = df[col].max()
|
||||
if str(col_type)[:3] == 'int':
|
||||
if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
|
||||
df[col] = df[col].astype(np.int8)
|
||||
elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
|
||||
df[col] = df[col].astype(np.int16)
|
||||
elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
|
||||
df[col] = df[col].astype(np.int32)
|
||||
elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
|
||||
df[col] = df[col].astype(np.int64)
|
||||
elif str(col_type)[:5] == 'float':
|
||||
if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
|
||||
df[col] = df[col].astype(np.float16)
|
||||
elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
|
||||
df[col] = df[col].astype(np.float32)
|
||||
else:
|
||||
df[col] = df[col].astype(np.float64)
|
||||
|
||||
end_mem = df.memory_usage().sum() / 1024 ** 2
|
||||
print('{:.2f} Mb, {:.2f} Mb ({:.2f} %)'.format(start_mem, end_mem, 100 * (start_mem - end_mem) / start_mem))
|
||||
gc.collect()
|
||||
|
||||
return df
|
||||
@@ -0,0 +1,16 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding:utf-8 -*-
|
||||
# @file : set_device.py
|
||||
# @Author: xLyons
|
||||
# @IDE : PyCharm
|
||||
# @Time : 2022/2/7
|
||||
|
||||
import tensorflow as tf
|
||||
|
||||
|
||||
def set_GPU():
|
||||
gpus = tf.config.experimental.list_physical_devices('GPU')
|
||||
print(gpus)
|
||||
|
||||
for gpu in gpus:
|
||||
tf.config.experimental.set_memory_growth(gpu, True)
|
||||
@@ -0,0 +1,20 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding:utf-8 -*-
|
||||
# @file : set_parament.py
|
||||
# @Author: xLyons
|
||||
# @IDE : PyCharm
|
||||
# @Time : 2022/2/7
|
||||
|
||||
import yaml
|
||||
from collections import namedtuple
|
||||
|
||||
|
||||
def get_args(yaml_path):
|
||||
with open(yaml_path, 'r', encoding='utf-8') as f:
|
||||
para_dict = yaml.load(f.read(), Loader=yaml.FullLoader)
|
||||
|
||||
ps = namedtuple('parser', list(para_dict.keys()))
|
||||
args = ps(**para_dict)
|
||||
f.close()
|
||||
|
||||
return args
|
||||
Reference in New Issue
Block a user