Files
fun-rec/codes/funrec/examples/preprocess.py
zhongqiangwu960812 2ea75605ab upload sdm_mind demo
2022-04-03 13:35:30 +08:00

292 lines
14 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
from random import sample, seed
import sys
sys.path.append("..")
import random
import numpy as np
import pandas as pd
from datetime import date, datetime
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
import tensorflow as tf
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
def process_data(sample_num=5000000):
    """Load the CTR training data and build model-ready input arrays.

    Reads up to ``sample_num`` rows from ``./data/train``, derives calendar
    features from the ``hour`` column (string format ``YYMMDDHH``),
    label-encodes every sparse feature to 1-based ids (0 is reserved for
    padding/masking), and returns vocabulary sizes, per-feature input arrays
    and the click labels.

    :param sample_num: maximum number of CSV rows to read.
    :return: tuple ``(feature_max_index_dict, train_input_dict, train_label)``
        where ``feature_max_index_dict`` maps feature name -> vocab size,
        ``train_input_dict`` maps column name -> ndarray, and ``train_label``
        is the ndarray of ``click`` labels.
    """
    train_data_path = "./data/train"
    print("read train data ...")
    train_data_df = pd.read_csv(train_data_path, sep=',', nrows=sample_num)
    all_df = train_data_df
    # 'hour' arrives as an integer like 14102100 (YYMMDDHH); treat as string
    # so we can slice out the date and hour parts.
    all_df['hour'] = all_df['hour'].astype(str)

    # Time-derived helper features.
    def _convert_weekday(timestamp):
        # Day of week as an int 0..6 (Sunday == 0, per strftime('%w')).
        dt = date(int('20' + timestamp[0:2]), int(timestamp[2:4]), int(timestamp[4:6]))
        return int(dt.strftime('%w'))

    def _convert_weekend(timestamp):
        # 1 if the date falls on Saturday/Sunday, else 0.
        dt = date(int('20' + timestamp[0:2]), int(timestamp[2:4]), int(timestamp[4:6]))
        return 1 if dt.strftime('%w') in ['6', '0'] else 0

    # is_weekend: weekend flag; weekday: day of week; hour_v2: hour of day.
    all_df['is_weekend'] = all_df['hour'].apply(_convert_weekend)
    all_df['weekday'] = all_df['hour'].apply(_convert_weekday)
    all_df['hour_v2'] = all_df['hour'].apply(lambda x: int(x[6:8]))
    del all_df['hour']

    sparse_features = ['id', 'C1', 'banner_pos', 'site_id', 'site_domain',
                       'site_category', 'app_id', 'app_domain', 'app_category', 'device_id',
                       'device_ip', 'device_model', 'device_type', 'device_conn_type', 'C14',
                       'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21', 'is_weekend',
                       'weekday', 'hour_v2']
    print("start label encode ... ")
    feature_max_index_dict = {}
    for feat in sparse_features:
        lbe = LabelEncoder()
        # Shift encoded ids to start at 1 so 0 can serve as a mask value.
        all_df[feat] = lbe.fit_transform(all_df[feat]) + 1
        feature_max_index_dict[feat] = all_df[feat].max() + 1

    train_df = all_df
    # BUGFIX: extract the label and drop 'click' BEFORE building the input
    # dict. The original built the dict over all columns first, so the label
    # column leaked into the model inputs.
    train_label = np.array(train_df['click'])
    train_df.pop('click')
    train_input_dict = {name: np.asarray(train_df[name].values) for name in train_df.columns}
    return feature_max_index_dict, train_input_dict, train_label
def gen_sdm_data_set(click_data, seq_short_len=5, seq_prefer_len=50):
    """Build SDM train/test samples from a click log via a sliding window.

    Each emitted sample is the tuple
    ``(user_id, short_hist, prefer_hist, target_item, label,
       short_len, prefer_len,
       short_cat1, prefer_cat1, short_cat2, prefer_cat2)``
    with histories ordered most-recent-first. When the window is no longer
    than ``seq_short_len`` the long-term ("prefer") parts are all-zero
    placeholders. For every user the longest window goes to the test set;
    every shorter window goes to the train set.

    :param click_data: DataFrame with columns user_id, article_id, cat_1,
        cat_2, expo_time.
    :param seq_short_len: length of the short-term session.
    :param seq_prefer_len: maximum length of the long-term session.
    :return: (train_set, test_set), both shuffled lists of tuples.
    """
    click_data.sort_values("expo_time", inplace=True)
    train_set, test_set = [], []
    for user_id, hist_click in tqdm(click_data.groupby('user_id')):
        pos_list = hist_click['article_id'].tolist()
        cat1_list = hist_click['cat_1'].tolist()
        cat2_list = hist_click['cat_2'].tolist()
        # Sliding-window split over the chronological click sequence.
        for i in range(1, len(pos_list)):
            hist = pos_list[:i]
            cat1_hist = cat1_list[:i]
            cat2_hist = cat2_list[:i]
            if i <= seq_short_len and i != len(pos_list) - 1:
                # History only fills the short-term session.
                train_set.append((
                    user_id, hist[::-1], [0] * seq_prefer_len, pos_list[i], 1,
                    len(hist[::-1]), 0,
                    cat1_hist[::-1], [0] * seq_prefer_len,
                    cat2_hist[::-1], [0] * seq_prefer_len
                ))
            elif i != len(pos_list) - 1:
                # History spills over into the long-term session.
                train_set.append((
                    user_id, hist[::-1][:seq_short_len], hist[::-1][seq_short_len:], pos_list[i], 1,
                    seq_short_len, len(hist[::-1]) - seq_short_len,
                    cat1_hist[::-1][:seq_short_len], cat1_hist[::-1][seq_short_len:],
                    cat2_hist[::-1][:seq_short_len], cat2_hist[::-1][seq_short_len:]
                ))
            elif i <= seq_short_len and i == len(pos_list) - 1:
                # Longest window of this user -> test set (short-only case).
                # BUGFIX: the original referenced the undefined name
                # 'seq_perfer_len' here, raising NameError on this branch.
                test_set.append((
                    user_id, hist[::-1], [0] * seq_prefer_len, pos_list[i], 1,
                    len(hist[::-1]), 0,
                    cat1_hist[::-1], [0] * seq_prefer_len,
                    cat2_hist[::-1], [0] * seq_prefer_len
                ))
            else:
                # Longest window of this user -> test set (short + long case).
                # BUGFIX: the original sliced cat2_list (the user's FULL
                # category sequence) instead of cat2_hist (the window
                # prefix), misaligning the short-term cat_2 feature.
                test_set.append((
                    user_id, hist[::-1][:seq_short_len], hist[::-1][seq_short_len:], pos_list[i], 1,
                    seq_short_len, len(hist[::-1]) - seq_short_len,
                    cat1_hist[::-1][:seq_short_len], cat1_hist[::-1][seq_short_len:],
                    cat2_hist[::-1][:seq_short_len], cat2_hist[::-1][seq_short_len:]
                ))
    random.shuffle(train_set)
    random.shuffle(test_set)
    return train_set, test_set
def gen_sdm_model_input(train_set, user_profile, seq_short_len, seq_prefer_len):
    """Turn SDM samples into the padded feature dict the model consumes.

    :param train_set: list of tuples produced by gen_sdm_data_set:
        [user_id, short_seq, prefer_seq, item_id, label, short_len,
         prefer_len, cat1_short, cat1_prefer, cat2_short, cat2_prefer]
    :param user_profile: DataFrame indexed by user_id providing
        gender/age/city columns (looked up via .loc).
    :param seq_short_len: fixed length to pad short-term sequences to.
    :param seq_prefer_len: fixed length to pad long-term sequences to.
    :return: (train_model_input dict, train_label ndarray)
    """
    train_uid = np.array([row[0] for row in train_set])
    short_train_seq = [row[1] for row in train_set]
    prefer_train_seq = [row[2] for row in train_set]
    train_iid = np.array([row[3] for row in train_set])
    train_label = np.array([row[4] for row in train_set])
    train_short_len = np.array([row[5] for row in train_set])
    train_prefer_len = np.array([row[6] for row in train_set])
    # BUGFIX: keep the variable-length sequences as plain lists. The original
    # wrapped these ragged lists in np.array(), which raises a ValueError
    # ("inhomogeneous shape") on modern NumPy; pad_sequences accepts lists of
    # lists directly.
    short_train_seq_cat1 = [row[7] for row in train_set]
    prefer_train_seq_cat1 = [row[8] for row in train_set]
    short_train_seq_cat2 = [row[9] for row in train_set]
    prefer_train_seq_cat2 = [row[10] for row in train_set]
    # Zero-pad / truncate every sequence at the tail to its fixed length.
    train_short_item_pad = pad_sequences(short_train_seq, maxlen=seq_short_len, padding='post', truncating='post',
                                         value=0)
    train_prefer_item_pad = pad_sequences(prefer_train_seq, maxlen=seq_prefer_len, padding='post', truncating='post',
                                          value=0)
    train_short_cat1_pad = pad_sequences(short_train_seq_cat1, maxlen=seq_short_len, padding='post', truncating='post',
                                         value=0)
    train_prefer_cat1_pad = pad_sequences(prefer_train_seq_cat1, maxlen=seq_prefer_len, padding='post',
                                          truncating='post', value=0)
    train_short_cat2_pad = pad_sequences(short_train_seq_cat2, maxlen=seq_short_len, padding='post', truncating='post',
                                         value=0)
    train_prefer_cat2_pad = pad_sequences(prefer_train_seq_cat2, maxlen=seq_prefer_len, padding='post',
                                          truncating='post', value=0)
    # Assemble the named-input dict.
    train_model_input = {
        "user_id": train_uid,
        "doc_id": train_iid,
        "short_doc_id": train_short_item_pad,
        "prefer_doc_id": train_prefer_item_pad,
        "prefer_sess_length": train_prefer_len,
        "short_sess_length": train_short_len,
        "short_cat1": train_short_cat1_pad,
        "prefer_cat1": train_prefer_cat1_pad,
        "short_cat2": train_short_cat2_pad,
        "prefer_cat2": train_prefer_cat2_pad
    }
    # Join in the static user-profile features by user_id.
    for key in ["gender", "age", "city"]:
        train_model_input[key] = user_profile.loc[train_model_input['user_id']][key].values
    return train_model_input, train_label
def gen_neg_sample_candiate(pos_list, item_ids, doc_clicked_count_dict, negsample, methods='multinomial'):
    """Sample negative items for one user.

    :param pos_list: items the user clicked (excluded from negatives).
    :param item_ids: all candidate item ids.
    :param doc_clicked_count_dict: item id -> click count (popularity).
    :param negsample: number of negatives requested per positive.
    :param methods: 'multinomial' for popularity-weighted (word2vec-style)
        sampling; anything else samples uniformly over unclicked items.
    :return: array of negative item ids. NOTE(review): the 'multinomial'
        branch filters out clicked items AFTER drawing, so it can return
        fewer than len(pos_list) * negsample items — callers indexing the
        result positionally should use the uniform branch.
    """
    if methods == 'multinomial':
        # word2vec-style negative sampling: boost popular items' chance of
        # being drawn, smoothed with the classic count ** 0.75 exponent.
        items = list(doc_clicked_count_dict.keys())
        item_counts = list(doc_clicked_count_dict.values())
        item_freqs = np.array(item_counts) / np.sum(item_counts)
        item_freqs = item_freqs ** 0.75
        item_freqs = item_freqs / np.sum(item_freqs)
        # BUGFIX: tf.random.categorical expects (unnormalized) LOG
        # probabilities; the original passed raw probabilities, which skews
        # the sampling distribution. Pass log(p) instead.
        neg_item_index = tf.random.categorical(np.log(item_freqs).reshape(1, -1), len(pos_list) * negsample)
        neg_list = np.array([items[i] for i in neg_item_index.numpy().tolist()[0] if items[i] not in pos_list])
        random.shuffle(neg_list)
    else:
        # Uniform sampling (with replacement) over items the user never clicked.
        candidate_set = list(set(item_ids) - set(pos_list))
        neg_list = np.random.choice(candidate_set, size=len(pos_list) * negsample, replace=True)
    return neg_list
def gen_data_set(click_data, doc_clicked_count_dict, negsample, control_users=False):
    """Build the YoutubeDNN train/test sets from a click log.

    Uses a sliding window over each user's chronological clicks: every prefix
    becomes a training sample (target = next click, label 1) plus
    ``negsample`` sampled negatives (label 0). Each user's single longest
    window becomes their test row, so the test set has one row per user.

    :param click_data: DataFrame with at least user_id, article_id,
        expo_time, city, age, gender columns.
    :param doc_clicked_count_dict: item id -> click count, forwarded to the
        negative sampler.
    :param negsample: negatives per positive; 0 disables negative sampling.
    :param control_users: if True, cap each user at 30 training windows.
    :return: (train_set, test_set), both shuffled lists of rows
        [user_id, hist, target, city, age, gender, label, hist_len].
    """
    # Sort by exposure time so windows respect chronology.
    click_data.sort_values("expo_time", inplace=True)
    item_ids = click_data['article_id'].unique()
    train_set, test_set = [], []
    for user_id, hist_click in tqdm(click_data.groupby('user_id')):
        pos_list = hist_click['article_id'].tolist()
        user_control_flag = True
        if control_users:
            user_samples_cou = 0
        # Truncate over-long histories (EDA showed users with 1000+ clicks).
        if len(pos_list) > 50:
            pos_list = pos_list[-50:]
        if negsample > 0:
            # 'random' (uniform) sampling guarantees exactly
            # len(pos_list) * negsample negatives, which the positional
            # indexing below relies on.
            neg_list = gen_neg_sample_candiate(pos_list, item_ids, doc_clicked_count_dict, negsample,
                                               methods='random')
        if len(pos_list) < 2:
            # Need at least two clicks to form a (history, target) pair.
            continue
        else:
            for i in range(1, len(pos_list)):
                hist = pos_list[:i]
                if user_control_flag and i != len(pos_list) - 1:
                    row = [user_id, hist[::-1], pos_list[i], hist_click.iloc[0]['city'], hist_click.iloc[0]['age'],
                           hist_click.iloc[0]['gender'], 1, len(hist[::-1])]
                    train_set.append(row)
                    for negi in range(negsample):
                        row = [user_id, hist[::-1], neg_list[i * negsample + negi], hist_click.iloc[0]['city'],
                               hist_click.iloc[0]['age'], hist_click.iloc[0]['gender'], 0, len(hist[::-1])]
                        train_set.append(row)
                    if control_users:
                        user_samples_cou += 1
                        # Cap each user at 30 training windows.
                        # BUGFIX: the original assigned False to
                        # user_samples_cou (resetting the counter) instead of
                        # clearing user_control_flag, so the cap never took
                        # effect.
                        if user_samples_cou > 30:
                            user_control_flag = False
                elif i == len(pos_list) - 1:
                    # Longest window per user -> test set (one row per user).
                    row = [user_id, hist[::-1], pos_list[i], hist_click.iloc[0]['city'], hist_click.iloc[0]['age'],
                           hist_click.iloc[0]['gender'], 0, len(hist[::-1])]
                    test_set.append(row)
    random.shuffle(train_set)
    random.shuffle(test_set)
    return train_set, test_set
"""构造模型输入"""
def gen_model_input(train_set, his_seq_max_len):
    """Assemble the padded model-input dict and the label array.

    Each row of ``train_set`` is
    ``[user_id, hist_list, cur_doc_id, city, age, gender, label, hist_len]``.
    Click histories are zero-padded / truncated at the tail to
    ``his_seq_max_len``.

    :return: (train_model_input dict, train_label ndarray)
    """
    def _column(idx):
        # Extract one positional field from every sample row.
        return [row[idx] for row in train_set]

    hist_pad = pad_sequences(_column(1), maxlen=his_seq_max_len,
                             padding='post', truncating='post', value=0)
    train_model_input = {
        "user_id": np.array(_column(0)),
        "doc_id": np.array(_column(2)),
        "hist_doc_id": hist_pad,
        "hist_len": np.array(_column(7)),
        "u_city": np.array(_column(3)),
        "u_age": np.array(_column(4)),
        "u_gender": np.array(_column(5)),
    }
    return train_model_input, np.array(_column(6))