292 lines
14 KiB
Python
292 lines
14 KiB
Python
from random import sample, seed
|
||
import sys
|
||
sys.path.append("..")
|
||
import random
|
||
import numpy as np
|
||
import pandas as pd
|
||
from datetime import date, datetime
|
||
from sklearn.preprocessing import LabelEncoder
|
||
from tqdm import tqdm
|
||
import tensorflow as tf
|
||
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
|
||
|
||
def process_data(sample_num=5000000):
|
||
train_data_path = "./data/train"
|
||
print("read train data ...")
|
||
train_data_df = pd.read_csv(train_data_path, sep=',', nrows=sample_num)
|
||
|
||
all_df = train_data_df
|
||
all_df['hour'] = all_df['hour'].astype(str)
|
||
|
||
# 构造时间相关的特征
|
||
def _convert_weekday(timestamp):
|
||
dt = date(int('20' + timestamp[0:2]), int(timestamp[2:4]), int(timestamp[4:6]))
|
||
return int(dt.strftime('%w'))
|
||
|
||
def _convert_weekend(timestamp):
|
||
dt = date(int('20' + timestamp[0:2]), int(timestamp[2:4]), int(timestamp[4:6]))
|
||
return 1 if dt.strftime('%w') in ['6', '0'] else 0
|
||
|
||
"""
|
||
is_weekend: 是否是周末
|
||
weekday: 星期几
|
||
hour: 几点
|
||
"""
|
||
all_df['is_weekend'] = all_df['hour'].apply(lambda x: _convert_weekend(x))
|
||
all_df['weekday'] = all_df['hour'].apply(lambda x: _convert_weekday(x))
|
||
all_df['hour_v2'] = all_df['hour'].apply(lambda x: int(x[6:8]))
|
||
del all_df['hour']
|
||
|
||
sparse_features = ['id', 'C1', 'banner_pos', 'site_id', 'site_domain',
|
||
'site_category', 'app_id', 'app_domain', 'app_category', 'device_id',
|
||
'device_ip', 'device_model', 'device_type', 'device_conn_type', 'C14',
|
||
'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21', 'is_weekend',
|
||
'weekday', 'hour_v2']
|
||
|
||
print("start label encode ... ")
|
||
feature_max_index_dict = {}
|
||
for feat in sparse_features:
|
||
lbe = LabelEncoder()
|
||
all_df[feat] = lbe.fit_transform(all_df[feat]) + 1 # 让id从1开始,0可能会被做掩码
|
||
feature_max_index_dict[feat] = all_df[feat].max() + 1
|
||
|
||
train_df = all_df
|
||
feature_names = train_df.columns
|
||
train_input_dict = {}
|
||
for name in feature_names:
|
||
train_input_dict[name] = np.array(train_df[name].values)
|
||
|
||
train_label = np.array(train_df['click'])
|
||
train_df.pop('click')
|
||
return feature_max_index_dict, train_input_dict, train_label
|
||
|
||
|
||
def gen_sdm_data_set(click_data, seq_short_len=5, seq_prefer_len=50):
|
||
"""
|
||
:param: seq_short_len: 短期会话的长度
|
||
:param: seq_prefer_len: 会话的最长长度
|
||
"""
|
||
click_data.sort_values("expo_time", inplace=True)
|
||
|
||
train_set, test_set = [], []
|
||
for user_id, hist_click in tqdm(click_data.groupby('user_id')):
|
||
pos_list = hist_click['article_id'].tolist()
|
||
cat1_list = hist_click['cat_1'].tolist()
|
||
cat2_list = hist_click['cat_2'].tolist()
|
||
|
||
# 滑动窗口切分数据
|
||
for i in range(1, len(pos_list)):
|
||
hist = pos_list[:i]
|
||
cat1_hist = cat1_list[:i]
|
||
cat2_hist = cat2_list[:i]
|
||
# 序列长度只够短期的
|
||
if i <= seq_short_len and i != len(pos_list) - 1:
|
||
train_set.append((
|
||
# 用户id, 用户短期历史行为序列, 用户长期历史行为序列, 当前行为文章, label,
|
||
user_id, hist[::-1], [0] * seq_prefer_len, pos_list[i], 1,
|
||
# 用户短期历史序列长度, 用户长期历史序列长度,
|
||
len(hist[::-1]), 0,
|
||
# 用户短期历史序列对应类别1, 用户长期历史行为序列对应类别1
|
||
cat1_hist[::-1], [0] * seq_prefer_len,
|
||
# 历史短期历史序列对应类别2, 用户长期历史行为序列对应类别2
|
||
cat2_hist[::-1], [0] * seq_prefer_len
|
||
))
|
||
# 序列长度够长期的
|
||
elif i != len(pos_list) - 1:
|
||
train_set.append((
|
||
# 用户id, 用户短期历史行为序列,用户长期历史行为序列, 当前行为文章, label
|
||
user_id, hist[::-1][:seq_short_len], hist[::-1][seq_short_len:], pos_list[i], 1,
|
||
# 用户短期行为序列长度,用户长期行为序列长度,
|
||
seq_short_len, len(hist[::-1]) - seq_short_len,
|
||
# 用户短期历史行为序列对应类别1, 用户长期历史行为序列对应类别1
|
||
cat1_hist[::-1][:seq_short_len], cat1_hist[::-1][seq_short_len:],
|
||
# 用户短期历史行为序列对应类别2, 用户长期历史行为序列对应类别2
|
||
cat2_hist[::-1][:seq_short_len], cat2_hist[::-1][seq_short_len:]
|
||
))
|
||
# 测试集保留最长的那一条
|
||
elif i <= seq_short_len and i == len(pos_list) - 1:
|
||
test_set.append((
|
||
user_id, hist[::-1], [0] * seq_prefer_len, pos_list[i], 1,
|
||
len(hist[::-1]), 0,
|
||
cat1_hist[::-1], [0] * seq_perfer_len,
|
||
cat2_hist[::-1], [0] * seq_prefer_len
|
||
))
|
||
else:
|
||
test_set.append((
|
||
user_id, hist[::-1][:seq_short_len], hist[::-1][seq_short_len:], pos_list[i], 1,
|
||
seq_short_len, len(hist[::-1]) - seq_short_len,
|
||
cat1_hist[::-1][:seq_short_len], cat1_hist[::-1][seq_short_len:],
|
||
cat2_list[::-1][:seq_short_len], cat2_hist[::-1][seq_short_len:]
|
||
))
|
||
|
||
random.shuffle(train_set)
|
||
random.shuffle(test_set)
|
||
|
||
return train_set, test_set
|
||
|
||
|
||
def gen_sdm_model_input(train_set, user_profile, seq_short_len, seq_prefer_len):
|
||
"""构造模型输入"""
|
||
# row: [user_id, short_train_seq, perfer_train_seq, item_id, label, short_len, perfer_len, cat_1_short, cat_1_perfer, cat_2_short, cat_2_prefer]
|
||
train_uid = np.array([row[0] for row in train_set])
|
||
short_train_seq = [row[1] for row in train_set]
|
||
prefer_train_seq = [row[2] for row in train_set]
|
||
train_iid = np.array([row[3] for row in train_set])
|
||
train_label = np.array([row[4] for row in train_set])
|
||
train_short_len = np.array([row[5] for row in train_set])
|
||
train_prefer_len = np.array([row[6] for row in train_set])
|
||
short_train_seq_cat1 = np.array([row[7] for row in train_set])
|
||
prefer_train_seq_cat1 = np.array([row[8] for row in train_set])
|
||
short_train_seq_cat2 = np.array([row[9] for row in train_set])
|
||
prefer_train_seq_cat2 = np.array([row[10] for row in train_set])
|
||
|
||
# padding操作
|
||
train_short_item_pad = pad_sequences(short_train_seq, maxlen=seq_short_len, padding='post', truncating='post',
|
||
value=0)
|
||
train_prefer_item_pad = pad_sequences(prefer_train_seq, maxlen=seq_prefer_len, padding='post', truncating='post',
|
||
value=0)
|
||
train_short_cat1_pad = pad_sequences(short_train_seq_cat1, maxlen=seq_short_len, padding='post', truncating='post',
|
||
value=0)
|
||
train_prefer_cat1_pad = pad_sequences(prefer_train_seq_cat1, maxlen=seq_prefer_len, padding='post',
|
||
truncating='post', value=0)
|
||
train_short_cat2_pad = pad_sequences(short_train_seq_cat2, maxlen=seq_short_len, padding='post', truncating='post',
|
||
value=0)
|
||
train_prefer_cat2_pad = pad_sequences(prefer_train_seq_cat2, maxlen=seq_prefer_len, padding='post',
|
||
truncating='post', value=0)
|
||
|
||
# 形成输入词典
|
||
train_model_input = {
|
||
"user_id": train_uid,
|
||
"doc_id": train_iid,
|
||
"short_doc_id": train_short_item_pad,
|
||
"prefer_doc_id": train_prefer_item_pad,
|
||
"prefer_sess_length": train_prefer_len,
|
||
"short_sess_length": train_short_len,
|
||
"short_cat1": train_short_cat1_pad,
|
||
"prefer_cat1": train_prefer_cat1_pad,
|
||
"short_cat2": train_short_cat2_pad,
|
||
"prefer_cat2": train_prefer_cat2_pad
|
||
}
|
||
|
||
# 其他的用户特征加入
|
||
for key in ["gender", "age", "city"]:
|
||
train_model_input[key] = user_profile.loc[train_model_input['user_id']][key].values
|
||
|
||
return train_model_input, train_label
|
||
|
||
def gen_neg_sample_candiate(pos_list, item_ids, doc_clicked_count_dict, negsample, methods='multinomial'):
|
||
# w2v的负样本采样, 增加高热item成为负样本的概率
|
||
# 参考https://blog.csdn.net/weixin_42299244/article/details/112734531, 这里用tf相应函数替换
|
||
# tf.random.categoral 非相关数据的降采样
|
||
if methods == 'multinomial':
|
||
# input表示每个item出现的次数
|
||
items, item_counts = [], []
|
||
# 用户未点击的item 这个遍历效率很低 后面看看能不能优化下
|
||
# for item_id in list(set(item_ids)-set(pos_list)):
|
||
# items.append(item_id)
|
||
# item_counts.append(doc_clicked_count_dict[item_id]*1.0)
|
||
|
||
items = list(doc_clicked_count_dict.keys())
|
||
item_counts = list(doc_clicked_count_dict.values())
|
||
item_freqs = np.array(item_counts) / np.sum(item_counts)
|
||
item_freqs = item_freqs ** 0.75
|
||
item_freqs = item_freqs / np.sum(item_freqs)
|
||
neg_item_index = tf.random.categorical(item_freqs.reshape(1, -1), len(pos_list) * negsample)
|
||
neg_list = np.array([items[i] for i in neg_item_index.numpy().tolist()[0] if items[i] not in pos_list])
|
||
random.shuffle(neg_list)
|
||
# 曝光样本随机采
|
||
else:
|
||
candidate_set = list(set(item_ids) - set(pos_list)) # 热度采样
|
||
neg_list = np.random.choice(candidate_set, size=len(pos_list) * negsample, replace=True) # 对于每个正样本,选择n个负样本
|
||
|
||
return neg_list
|
||
|
||
def gen_data_set(click_data, doc_clicked_count_dict, negsample, control_users=False):
|
||
"""构造youtubeDNN的数据集"""
|
||
# 按照曝光时间排序
|
||
click_data.sort_values("expo_time", inplace=True)
|
||
item_ids = click_data['article_id'].unique()
|
||
|
||
train_set, test_set = [], []
|
||
for user_id, hist_click in tqdm(click_data.groupby('user_id')):
|
||
# 这里按照expo_date分开,每一天用滑动窗口滑,可能相关性更高些,另外,这样序列不会太长,因为eda发现有点击1111个的
|
||
# for expo_date, hist_click in hist_date_click.groupby('expo_date'):
|
||
# 用户当天的点击历史id
|
||
pos_list = hist_click['article_id'].tolist()
|
||
user_control_flag = True
|
||
|
||
if control_users:
|
||
user_samples_cou = 0
|
||
|
||
# 过长的序列截断
|
||
if len(pos_list) > 50:
|
||
pos_list = pos_list[-50:]
|
||
|
||
if negsample > 0:
|
||
neg_list = gen_neg_sample_candiate(pos_list, item_ids, doc_clicked_count_dict, negsample,
|
||
methods='random')
|
||
|
||
# 只有1个的也截断 去掉,当然我之前做了处理,这里没有这种情况了
|
||
if len(pos_list) < 2:
|
||
continue
|
||
else:
|
||
# 序列至少是2
|
||
for i in range(1, len(pos_list)):
|
||
hist = pos_list[:i]
|
||
# 这里采用打压热门item策略,降低高展item成为正样本的概率
|
||
#freq_i = doc_clicked_count_dict[pos_list[i]] / (np.sum(list(doc_clicked_count_dict.values())))
|
||
#p_posi = (np.sqrt(freq_i / 0.001) + 1) * (0.001 / freq_i)
|
||
|
||
# p_posi=0.3 表示该item_i成为正样本的概率是0.3,
|
||
if user_control_flag and i != len(pos_list) - 1:
|
||
#if random.random() > (1 - p_posi):
|
||
if True:
|
||
row = [user_id, hist[::-1], pos_list[i], hist_click.iloc[0]['city'], hist_click.iloc[0]['age'],
|
||
hist_click.iloc[0]['gender'], 1, len(hist[::-1])]
|
||
train_set.append(row)
|
||
|
||
for negi in range(negsample):
|
||
row = [user_id, hist[::-1], neg_list[i * negsample + negi], hist_click.iloc[0]['city'],
|
||
hist_click.iloc[0]['age'], hist_click.iloc[0]['gender'], 0, len(hist[::-1])]
|
||
train_set.append(row)
|
||
|
||
if control_users:
|
||
user_samples_cou += 1
|
||
# 每个用户序列最长是50, 即每个用户正样本个数最多是50个, 如果每个用户训练样本数量到了30个,训练集不能加这个用户了
|
||
if user_samples_cou > 30:
|
||
user_samples_cou = False
|
||
|
||
# 整个序列加入到test_set, 注意,这里一定每个用户只有一个最长序列,相当于测试集数目等于用户个数
|
||
elif i == len(pos_list) - 1:
|
||
row = [user_id, hist[::-1], pos_list[i], hist_click.iloc[0]['city'], hist_click.iloc[0]['age'],
|
||
hist_click.iloc[0]['gender'], 0, len(hist[::-1])]
|
||
test_set.append(row)
|
||
|
||
random.shuffle(train_set)
|
||
random.shuffle(test_set)
|
||
return train_set, test_set
|
||
|
||
"""构造模型输入"""
|
||
def gen_model_input(train_set, his_seq_max_len):
|
||
"""构造模型的输入"""
|
||
# row: [user_id, hist_list, cur_doc_id, city, age, gender, label, hist_len]
|
||
train_uid = np.array([row[0] for row in train_set])
|
||
train_hist_seq = [row[1] for row in train_set]
|
||
train_iid = np.array([row[2] for row in train_set])
|
||
train_u_city = np.array([row[3] for row in train_set])
|
||
train_u_age = np.array([row[4] for row in train_set])
|
||
train_u_gender = np.array([row[5] for row in train_set])
|
||
train_label = np.array([row[6] for row in train_set])
|
||
train_hist_len = np.array([row[7] for row in train_set])
|
||
|
||
train_seq_pad = pad_sequences(train_hist_seq, maxlen=his_seq_max_len, padding='post', truncating='post', value=0)
|
||
train_model_input = {
|
||
"user_id": train_uid,
|
||
"doc_id": train_iid,
|
||
"hist_doc_id": train_seq_pad,
|
||
"hist_len": train_hist_len,
|
||
"u_city": train_u_city,
|
||
"u_age": train_u_age,
|
||
"u_gender": train_u_gender,
|
||
}
|
||
return train_model_input, train_label |