import warnings warnings.filterwarnings("ignore") import itertools import pandas as pd import numpy as np from tqdm import tqdm from collections import namedtuple import tensorflow as tf from tensorflow.keras.layers import * from tensorflow.keras.models import * from sklearn.model_selection import train_test_split from sklearn.preprocessing import MinMaxScaler, LabelEncoder from utils import SparseFeat, DenseFeat, VarLenSparseFeat # 构建输入层 # 将输入的数据转换成字典的形式,定义输入层的时候让输入层的name和字典中特征的key一致,就可以使得输入的数据和对应的Input层对应 def build_input_layers(feature_columns): input_layer_dict = {} for fc in feature_columns: if isinstance(fc, SparseFeat): input_layer_dict[fc.name] = Input(shape=(1,), name=fc.name) elif isinstance(fc, DenseFeat): input_layer_dict[fc.name] = Input(shape=(fc.dimension, ), name=fc.name) elif isinstance(fc, VarLenSparseFeat): input_layer_dict[fc.name] = Input(shape=(fc.maxlen, ), name=fc.name) return input_layer_dict # 构建embedding层 def build_embedding_layers(feature_columns, input_layer_dict): embedding_layer_dict = {} for fc in feature_columns: if isinstance(fc, SparseFeat): embedding_layer_dict[fc.name] = Embedding(fc.vocabulary_size, fc.embedding_dim, name='emb_' + fc.name) elif isinstance(fc, VarLenSparseFeat): embedding_layer_dict[fc.name] = Embedding(fc.vocabulary_size + 1, fc.embedding_dim, name='emb_' + fc.name, mask_zero=True) return embedding_layer_dict def embedding_lookup(feature_columns, input_layer_dict, embedding_layer_dict): embedding_list = [] for fc in feature_columns: _input = input_layer_dict[fc] _embed = embedding_layer_dict[fc] embed = _embed(_input) embedding_list.append(embed) return embedding_list class Dice(Layer): def __init__(self): super(Dice, self).__init__() self.bn = BatchNormalization(center=False, scale=False) def build(self, input_shape): self.alpha = self.add_weight(shape=(input_shape[-1],), dtype=tf.float32, name='alpha') def call(self, x): x_normed = self.bn(x) x_p = tf.sigmoid(x_normed) return self.alpha * (1.0-x_p) * x + x_p * x class LocalActivationUnit(Layer): def __init__(self, hidden_units=(256, 128, 64), activation='prelu'): super(LocalActivationUnit, self).__init__() self.hidden_units = hidden_units self.linear = Dense(1) self.dnn = [Dense(unit, activation=PReLU() if activation == 'prelu' else Dice()) for unit in hidden_units] def call(self, inputs): # query: B x 1 x emb_dim keys: B x len x emb_dim query, keys = inputs # 获取序列长度 keys_len = keys.get_shape()[1] queries = tf.tile(query, multiples=[1, keys_len, 1]) # (None, len, emb_dim) # 将特征进行拼接 att_input = tf.concat([queries, keys, queries - keys, queries * keys], axis=-1) # B x len x 4*emb_dim # 将原始向量与外积结果拼接后输入到一个dnn中 att_out = att_input for fc in self.dnn: att_out = fc(att_out) # B x len x att_out att_out = self.linear(att_out) # B x len x 1 att_out = tf.squeeze(att_out, -1) # B x len return att_out class AttentionPoolingLayer(Layer): def __init__(self, att_hidden_units=(256, 128, 64)): super(AttentionPoolingLayer, self).__init__() self.att_hidden_units = att_hidden_units self.local_att = LocalActivationUnit(self.att_hidden_units) def call(self, inputs): # keys: B x len x emb_dim, queries: B x 1 x emb_dim queries, keys = inputs # 获取行为序列embedding的mask矩阵,将Embedding矩阵中的非零元素设置成True, key_masks = tf.not_equal(keys[:,:,0], 0) # B x len # key_masks = keys._keras_mask # tf的有些版本不能使用这个属性,2.1是可以的,2.4好像不行 # 获取行为序列中每个商品对应的注意力权重 attention_score = self.local_att([queries, keys]) # B x len # 去除最后一个维度,方便后续理解与计算 # outputs = attention_score # 创建一个padding的tensor, 目的是为了标记出行为序列embedding中无效的位置 paddings = tf.zeros_like(attention_score) # B x len # outputs 表示的是padding之后的attention_score outputs = tf.where(key_masks, attention_score, paddings) # B x len # 将注意力分数与序列对应位置加权求和,这一步可以在 outputs = tf.expand_dims(outputs, axis=1) # B x 1 x len # keys : B x len x emb_dim outputs = tf.matmul(outputs, keys) # B x 1 x dim outputs = tf.squeeze(outputs, axis=1) return outputs def get_dnn_logits(dnn_input, hidden_units=(200, 80), activation='prelu'): dnns = [Dense(unit, activation=PReLU() if activation == 'prelu' else Dice()) for unit in hidden_units] dnn_out = dnn_input for dnn in dnns: dnn_out = dnn(dnn_out) # 获取logits dnn_logits = Dense(1, activation='sigmoid')(dnn_out) return dnn_logits # 输入层拼接成列表 def concat_input_list(input_list): feature_nums = len(input_list) if feature_nums > 1: return Concatenate(axis=1)(input_list) elif feature_nums == 1: return input_list[0] else: return None # 将所有的sparse特征embedding拼接 def concat_embedding_list(feature_columns, input_layer_dict, embedding_layer_dict, flatten=False): embedding_list = [] for fc in feature_columns: _input = input_layer_dict[fc.name] # 获取输入层 _embed = embedding_layer_dict[fc.name] # B x 1 x dim 获取对应的embedding层 embed = _embed(_input) # B x dim 将input层输入到embedding层中 # 是否需要flatten, 如果embedding列表最终是直接输入到Dense层中,需要进行Flatten,否则不需要 if flatten: embed = Flatten()(embed) embedding_list.append(embed) return embedding_list def DIN(feature_columns, behavior_feature_list, behavior_seq_feature_list): # 构建Input层 input_layer_dict = build_input_layers(feature_columns) # 将Input层转化成列表的形式作为model的输入 input_layers = list(input_layer_dict.values()) # 筛选出特征中的sparse特征和dense特征,方便单独处理 sparse_feature_columns = list(filter(lambda x: isinstance(x, SparseFeat), feature_columns)) dense_feature_columns = list(filter(lambda x: isinstance(x, DenseFeat), feature_columns)) # 获取dense dnn_dense_input = [] for fc in dense_feature_columns: dnn_dense_input.append(input_layer_dict[fc.name]) # 将所有的dense特征拼接 dnn_dense_input = concat_input_list(dnn_dense_input) # 构建embedding字典 embedding_layer_dict = build_embedding_layers(feature_columns, input_layer_dict) # 因为这里最终需要将embedding拼接后直接输入到全连接层(Dense)中, 所以需要Flatten dnn_sparse_embed_input = concat_embedding_list(sparse_feature_columns, input_layer_dict, embedding_layer_dict, flatten=True) # 将所有sparse特征的embedding进行拼接 dnn_sparse_input = concat_input_list(dnn_sparse_embed_input) # 获取当前的行为特征(movie)的embedding,这里有可能有多个行为产生了行为序列,所以需要使用列表将其放在一起 query_embed_list = embedding_lookup(behavior_feature_list, input_layer_dict, embedding_layer_dict) # 获取行为序列(movie_id序列, hist_movie_id) 对应的embedding,这里有可能有多个行为产生了行为序列,所以需要使用列表将其放在一起 keys_embed_list = embedding_lookup(behavior_seq_feature_list, input_layer_dict, embedding_layer_dict) # 使用注意力机制将历史movie_id序列进行池化 dnn_seq_input_list = [] for i in range(len(keys_embed_list)): seq_emb = AttentionPoolingLayer()([query_embed_list[i], keys_embed_list[i]]) dnn_seq_input_list.append(seq_emb) # 将多个行为序列attention poolint 之后的embedding进行拼接 dnn_seq_input = concat_input_list(dnn_seq_input_list) # 将dense特征,sparse特征,及通过注意力加权的序列特征拼接 dnn_input = Concatenate(axis=1)([dnn_dense_input, dnn_sparse_input, dnn_seq_input]) # 获取最终dnn的logits dnn_logits = get_dnn_logits(dnn_input, activation='prelu') model = Model(input_layers, dnn_logits) return model if __name__ == "__main__": # 读取数据 samples_data = pd.read_csv("./data/movie_sample.txt", sep="\t", header = None) samples_data.columns = ["user_id", "gender", "age", "hist_movie_id", "hist_len", "movie_id", "movie_type_id", "label"] # samples_data = shuffle(samples_data) X = samples_data[["user_id", "gender", "age", "hist_movie_id", "hist_len", "movie_id", "movie_type_id"]] y = samples_data["label"] X_train = {"user_id": np.array(X["user_id"]), \ "gender": np.array(X["gender"]), \ "age": np.array(X["age"]), \ "hist_movie_id": np.array([[int(i) for i in l.split(',')] for l in X["hist_movie_id"]]), \ "hist_len": np.array(X["hist_len"]), \ "movie_id": np.array(X["movie_id"]), \ "movie_type_id": np.array(X["movie_type_id"])} y_train = np.array(y) feature_columns = [SparseFeat('user_id', max(samples_data["user_id"])+1, embedding_dim=8), SparseFeat('gender', max(samples_data["gender"])+1, embedding_dim=8), SparseFeat('age', max(samples_data["age"])+1, embedding_dim=8), SparseFeat('movie_id', max(samples_data["movie_id"])+1, embedding_dim=8), SparseFeat('movie_type_id', max(samples_data["movie_type_id"])+1, embedding_dim=8), DenseFeat('hist_len', 1)] feature_columns += [VarLenSparseFeat('hist_movie_id', vocabulary_size=max(samples_data["movie_id"])+1, embedding_dim=8, maxlen=50)] # 行为特征列表,表示的是基础特征 behavior_feature_list = ['movie_id'] # 行为序列特征 behavior_seq_feature_list = ['hist_movie_id'] history = DIN(feature_columns, behavior_feature_list, behavior_seq_feature_list) history.compile('adam', 'binary_crossentropy') history.fit(X_train, y_train, batch_size=64, epochs=5, validation_split=0.2, )