Files
fun-rec/codes/funrec/examples/preprocess.py
2022-03-28 22:16:08 +08:00

62 lines
2.2 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
from random import sample, seed
import sys
sys.path.append("..")
import os
import time
import numpy as np
import pandas as pd
from datetime import date, datetime
from sklearn.preprocessing import LabelEncoder
from features import DenseFeat, SparseFeat, VarLenSparseFeat
def process_data(sample_num=5000000, train_data_path="./data/train"):
    """Load the raw click log and turn it into label-encoded model inputs.

    Reads up to ``sample_num`` rows of the comma-separated file at
    ``train_data_path``, replaces the ``hour`` column with three derived
    time columns, and label-encodes every sparse feature column.

    The ``hour`` value is cast to ``str`` and sliced as ``YYMMDDHH``: the
    first six characters are the calendar day (the year is prefixed with
    ``'20'``) and characters 6-7 are the hour of day.

    Args:
        sample_num: Maximum number of rows to read (forwarded to
            ``pandas.read_csv`` as ``nrows``).
        train_data_path: Location of the training CSV. Defaults to
            ``"./data/train"``, the path that used to be hard-coded.

    Returns:
        A tuple ``(feature_max_index_dict, train_input_dict, train_label)``:

        * ``feature_max_index_dict`` maps each sparse feature name to its
          largest encoded id plus one.
        * ``train_input_dict`` maps every column name of the processed
          frame to a NumPy array holding that column's values.
        * ``train_label`` is a NumPy array of the ``click`` column.

    Raises:
        KeyError: If ``hour``, ``click`` or one of the sparse feature
            columns is missing from the file.
        ValueError: If an ``hour`` value cannot be parsed as ``YYMMDDHH``.
    """
    print("read train data ...")
    all_df = pd.read_csv(train_data_path, sep=',', nrows=sample_num)

    # --- Time-related features -------------------------------------------
    hour_str = all_df['hour'].astype(str)
    day_str = hour_str.str[0:6]

    def _convert_weekday(yymmdd):
        # '%w' numbers the days 0 (Sunday) through 6 (Saturday).
        dt = date(int('20' + yymmdd[0:2]), int(yymmdd[2:4]), int(yymmdd[4:6]))
        return int(dt.strftime('%w'))

    # Parse every distinct calendar day once and map the result back onto
    # the rows, rather than building a ``date`` twice for every single row.
    weekday_by_day = {day: _convert_weekday(day) for day in day_str.unique()}
    weekday = day_str.map(weekday_by_day)

    # is_weekend: 1 on Saturday/Sunday, else 0
    # weekday:    day of the week ('%w' numbering)
    # hour_v2:    hour of the day
    all_df['is_weekend'] = weekday.isin([0, 6]).astype(int)
    all_df['weekday'] = weekday
    all_df['hour_v2'] = hour_str.str[6:8].astype(int)
    del all_df['hour']

    sparse_features = [
        'id', 'C1', 'banner_pos', 'site_id', 'site_domain',
        'site_category', 'app_id', 'app_domain', 'app_category', 'device_id',
        'device_ip', 'device_model', 'device_type', 'device_conn_type', 'C14',
        'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21', 'is_weekend',
        'weekday', 'hour_v2',
    ]

    # --- Label encoding --------------------------------------------------
    print("start label encode ... ")
    feature_max_index_dict = {}
    for feat in sparse_features:
        # Shift ids so they start at 1; 0 is kept free because it may be
        # used as a mask value.
        all_df[feat] = LabelEncoder().fit_transform(all_df[feat]) + 1
        feature_max_index_dict[feat] = all_df[feat].max() + 1

    # --- Pack outputs ----------------------------------------------------
    # NOTE(review): 'click' is still part of ``train_input_dict``, exactly as
    # before. The original popped it from the frame only after this dict had
    # been built, which never affected the returned values, so that no-op
    # was dropped. Confirm with callers whether the label should be excluded
    # from the inputs.
    train_input_dict = {
        name: np.array(all_df[name].values) for name in all_df.columns
    }
    train_label = np.array(all_df['click'])
    return feature_max_index_dict, train_input_dict, train_label