From cd0bd4ac5ea72b9931744dd0e9ded970c88805e7 Mon Sep 17 00:00:00 2001 From: xLyons <75569795+Lyons-T@users.noreply.github.com> Date: Thu, 24 Feb 2022 22:50:53 +0800 Subject: [PATCH] Add files via upload --- .../create_ctr_data.cpython-39.pyc | Bin 0 -> 2067 bytes .../news_data_process.cpython-39.pyc | Bin 0 -> 5572 bytes .../dataset/data_process/create_ctr_data.py | 53 ++++ .../dataset/data_process/news_data_process.py | 163 +++++++++++ .../data_process/train&test_data_split.py | 46 +++ .../data_process/user&doc_data_process.py | 103 +++++++ .../raw_data/将原始文件存放在该目录下.txt | 13 + .../recprocess/rank/examples/deepfm_news.py | 29 ++ .../rank/examples/deepfm_ppnet_news.py | 29 ++ .../rank/examples/set_para/deepfm_news.yaml | 14 + .../examples/set_para/deepfm_ppnet_news.yaml | 16 + .../recprocess/rank/layers/__init__.py | 6 + .../__pycache__/__init__.cpython-39.pyc | Bin 0 -> 120 bytes .../__pycache__/activation.cpython-39.pyc | Bin 0 -> 643 bytes .../layers/__pycache__/core.cpython-39.pyc | Bin 0 -> 7751 bytes .../__pycache__/embedding.cpython-39.pyc | Bin 0 -> 1049 bytes .../__pycache__/interaction.cpython-39.pyc | Bin 0 -> 1156 bytes .../recprocess/rank/layers/activation.py | 24 ++ .../recprocess/rank/layers/core.py | 238 +++++++++++++++ .../recprocess/rank/layers/embedding.py | 41 +++ .../recprocess/rank/layers/interaction.py | 32 ++ .../recprocess/rank/model_tools/__init__.py | 6 + .../__pycache__/__init__.cpython-39.pyc | Bin 0 -> 125 bytes .../feature_columns.cpython-39.pyc | Bin 0 -> 3284 bytes .../rank/model_tools/feature_columns.py | 102 +++++++ .../recprocess/rank/models/__init__.py | 6 + .../__pycache__/__init__.cpython-39.pyc | Bin 0 -> 119 bytes .../models/__pycache__/deepfm.cpython-39.pyc | Bin 0 -> 1726 bytes .../__pycache__/deepfm_ppnet.cpython-39.pyc | Bin 0 -> 2042 bytes .../recprocess/rank/models/deepfm.py | 63 ++++ .../recprocess/rank/models/deepfm_ppnet.py | 77 +++++ .../news_rec_server/recprocess/rank/readme.md | 273 ++++++++++++++++++ .../__pycache__/run_deepfm.cpython-39.pyc | Bin 0 -> 1072 bytes .../run_deepfm_ppnet.cpython-39.pyc | Bin 0 -> 1142 bytes .../recprocess/rank/run_train/run_deepfm.py | 39 +++ .../rank/run_train/run_deepfm_ppnet.py | 42 +++ .../__pycache__/set_parament.cpython-39.pyc | Bin 0 -> 532 bytes .../recprocess/rank/utils/data_compression.py | 44 +++ .../recprocess/rank/utils/set_device.py | 16 + .../recprocess/rank/utils/set_parament.py | 20 ++ 40 files changed, 1495 insertions(+) create mode 100644 codes/news_recsys/news_rec_server/recprocess/rank/examples/dataset/data_process/__pycache__/create_ctr_data.cpython-39.pyc create mode 100644 codes/news_recsys/news_rec_server/recprocess/rank/examples/dataset/data_process/__pycache__/news_data_process.cpython-39.pyc create mode 100644 codes/news_recsys/news_rec_server/recprocess/rank/examples/dataset/data_process/create_ctr_data.py create mode 100644 codes/news_recsys/news_rec_server/recprocess/rank/examples/dataset/data_process/news_data_process.py create mode 100644 codes/news_recsys/news_rec_server/recprocess/rank/examples/dataset/data_process/train&test_data_split.py create mode 100644 codes/news_recsys/news_rec_server/recprocess/rank/examples/dataset/data_process/user&doc_data_process.py create mode 100644 codes/news_recsys/news_rec_server/recprocess/rank/examples/dataset/raw_data/将原始文件存放在该目录下.txt create mode 100644 codes/news_recsys/news_rec_server/recprocess/rank/examples/deepfm_news.py create mode 100644 codes/news_recsys/news_rec_server/recprocess/rank/examples/deepfm_ppnet_news.py create mode 100644 codes/news_recsys/news_rec_server/recprocess/rank/examples/set_para/deepfm_news.yaml create mode 100644 codes/news_recsys/news_rec_server/recprocess/rank/examples/set_para/deepfm_ppnet_news.yaml create mode 100644 codes/news_recsys/news_rec_server/recprocess/rank/layers/__init__.py create mode 100644 codes/news_recsys/news_rec_server/recprocess/rank/layers/__pycache__/__init__.cpython-39.pyc create mode 100644 codes/news_recsys/news_rec_server/recprocess/rank/layers/__pycache__/activation.cpython-39.pyc create mode 100644 codes/news_recsys/news_rec_server/recprocess/rank/layers/__pycache__/core.cpython-39.pyc create mode 100644 codes/news_recsys/news_rec_server/recprocess/rank/layers/__pycache__/embedding.cpython-39.pyc create mode 100644 codes/news_recsys/news_rec_server/recprocess/rank/layers/__pycache__/interaction.cpython-39.pyc create mode 100644 codes/news_recsys/news_rec_server/recprocess/rank/layers/activation.py create mode 100644 codes/news_recsys/news_rec_server/recprocess/rank/layers/core.py create mode 100644 codes/news_recsys/news_rec_server/recprocess/rank/layers/embedding.py create mode 100644 codes/news_recsys/news_rec_server/recprocess/rank/layers/interaction.py create mode 100644 codes/news_recsys/news_rec_server/recprocess/rank/model_tools/__init__.py create mode 100644 codes/news_recsys/news_rec_server/recprocess/rank/model_tools/__pycache__/__init__.cpython-39.pyc create mode 100644 codes/news_recsys/news_rec_server/recprocess/rank/model_tools/__pycache__/feature_columns.cpython-39.pyc create mode 100644 codes/news_recsys/news_rec_server/recprocess/rank/model_tools/feature_columns.py create mode 100644 codes/news_recsys/news_rec_server/recprocess/rank/models/__init__.py create mode 100644 codes/news_recsys/news_rec_server/recprocess/rank/models/__pycache__/__init__.cpython-39.pyc create mode 100644 codes/news_recsys/news_rec_server/recprocess/rank/models/__pycache__/deepfm.cpython-39.pyc create mode 100644 codes/news_recsys/news_rec_server/recprocess/rank/models/__pycache__/deepfm_ppnet.cpython-39.pyc create mode 100644 codes/news_recsys/news_rec_server/recprocess/rank/models/deepfm.py create mode 100644 codes/news_recsys/news_rec_server/recprocess/rank/models/deepfm_ppnet.py create mode 100644 codes/news_recsys/news_rec_server/recprocess/rank/readme.md create mode 100644 codes/news_recsys/news_rec_server/recprocess/rank/run_train/__pycache__/run_deepfm.cpython-39.pyc create mode 100644 codes/news_recsys/news_rec_server/recprocess/rank/run_train/__pycache__/run_deepfm_ppnet.cpython-39.pyc create mode 100644 codes/news_recsys/news_rec_server/recprocess/rank/run_train/run_deepfm.py create mode 100644 codes/news_recsys/news_rec_server/recprocess/rank/run_train/run_deepfm_ppnet.py create mode 100644 codes/news_recsys/news_rec_server/recprocess/rank/utils/__pycache__/set_parament.cpython-39.pyc create mode 100644 codes/news_recsys/news_rec_server/recprocess/rank/utils/data_compression.py create mode 100644 codes/news_recsys/news_rec_server/recprocess/rank/utils/set_device.py create mode 100644 codes/news_recsys/news_rec_server/recprocess/rank/utils/set_parament.py diff --git a/codes/news_recsys/news_rec_server/recprocess/rank/examples/dataset/data_process/__pycache__/create_ctr_data.cpython-39.pyc b/codes/news_recsys/news_rec_server/recprocess/rank/examples/dataset/data_process/__pycache__/create_ctr_data.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cc6a3218d2b1cf01578174f93ba7171a91b58c81 GIT binary patch literal 2067 zcmbtVOK%)S5bo}I?Ck7%y*LoZ32!1xgOCZ~5M)^vLRO9u!jTXwqtSS~*Is*Gbk7Fc zn!VsiULPVrTsYbnB%e4V{z)G}0r||05TL5ZYcC0KNsqevQQcitUp*$8n(_%WdvtmI zLywT3aWUB}SiBEieE@_LPGi!kM%rO!ZFOuIS?nZk$EC=#;#%T$yrkZ#!@9;D?%t;z zpVL(mc&G3Vxf4)bQ=5ZcFBSpQ{%RgcC2otT(DfCOR@1v(6h3YZn1F%=_f&@xQjikV znFZ1N?92I~RnQ|=TE}#a9g?A4T1VC~q2yyw$cAsCVx7xdXzVK^S)QQD#-8<1ND0rIO)=zJ^w2*Jdn#uJ+ z6F~pv+wUKK_1&*u{P6JL$J4){?X?P_iZG7WMQlzw{gjIxZO5WlfLE=}>~qbsRJS;~ zH0*(U2T~|Kt&F?RH@e`>^TuFSgMJCRdKE|?{QT*aAuVY^j_8P#fcgU!VPVwQ6f&L_6iyG(~$S%6ukNS;>zOM?LqpH=&p&K zD9K}?*3e=l3Nwbe%(_CUwXOuMMc6H5SRt~j{UzS-7Tqk#mtTRmeuGZ--gxrhQk-?8 zSS>GHoPIVtjpDKe!sHAPu&HM7KQl?rBIgQ_*Kjfimo*1ry_z>*$seLViTICPJP=Q8C&& zV{PTja79Dnp{`D)t1psu!9xsuU9aY#z4q@xK(2?Xzb7Py2$+VoR3c{DN}?V4I_@&< zMR9C`UE7h|P)|U-0EZMtq3f_mXpl`?l>HIZwVmZ6)o$MJZpQ+uKo)WBbmL43iAk+l z&UFLQiHB!LoJ7bCq1}5?JP;~q0LqYpVICEm@=dtW@di*fFxfQi>E>De2>WR->ohKE zkJ3%lcwAmO%_`OVX+9{_cx9@TI=gnpYLBmk-7Fp?X=Q3)OLUsiKwiVyWh9thdiseh z$K+5qfe_jT80IYx#x`rwtE>&c%upYClU1Y7T+4mzIxWY2K!5Qu1wJbfy%*8Od~EQ1oL}L3u67xQN3d%VM=K#${}T@hS$V5it>z=aH$s VC3s*Ei)H+yn(*`}eofZ4{swKl<}m;O literal 0 HcmV?d00001 diff --git a/codes/news_recsys/news_rec_server/recprocess/rank/examples/dataset/data_process/__pycache__/news_data_process.cpython-39.pyc b/codes/news_recsys/news_rec_server/recprocess/rank/examples/dataset/data_process/__pycache__/news_data_process.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ccba4681b020d3540eb826c02063f9b7261fcdb8 GIT binary patch literal 5572 zcmbVQ>yI1N6`#2?w#Q!E>wP~GLQ+z|rCnZ1A9S073Mp-)RBG~Rl*pR(o%PPHJ@(!i zlk9dZrEKU1sY{bWfI<+v5#^y)Nhk;iAt3cb|A)@kvg_<8K2-Z9ADaHo9eZDiQiQF! z&vVW_=iGbG<64SBrXkyUIax`WDcO#dJB}zk&J(8;GtCX2M4jOo-oewSR5Q!f!%9B8iglDj zc{SAiXTN!{-TAQG8`m4OFfU zfIqHreL-umlEPyb)PNOO#EGv3j2lDBocbg_YL6;BNj%D#)TWu#C{ffwu|rZ!^9(3z zpqK?kt-u0pG8SmOb4Xc?2a2o{OUjZ8ohZ+-aW@!Y7g396r<7Snuy51zt$nD5^*Kg5Sia{bB7WZKX2 z9en2kxaGrG|E`v-8sD`pE84(UCOd?tLd~;lX3}xFeWFmTmh*`)5kd2{G$5{D{CxSX`Q9YfLM9{=CzK$+M7+ zJY*VH$(DkN9PnjvCmI<(6+1fKs$Q@2G1Z_X?XwMrt!*YP9Tlez97q z?I#ZYgHM^gXWX{@y0F`*BctVN(JFiU(P%@9EFiy`I1#U1z|S+$gUU2KXrK0HLk!W2 z3DJjQh#LJ=bfTC|^J-DTj`%111E64F)J&DD#|ss^QWZ0_g^q7m3J$N+Caf&ZuU>dn zZr<`A&#t^Yzw*jQtFL?*4ZQw&^ZEIebC*}=|0oB({N!ZwQ|#35o9|zZ20s16#sKa2 z@^c@ozI5^Wr3Ep7o*=E6i6>kxst!LAvWiECs_qvqr&ExT_kPIDHgEjg<{#ZT(UAJw&^J}M$1-Zj9dF*F(YNo z?zzJzMzCU-fOnu+B`!c%w@x_T7WRlWvSN}w+7fw2({x7ZghuC6asy`5CEG7d*)xUX zRl!?(8^);PZDUw&?7bitZR}38LcMH{`+`(V8l0#e4_VdC$0IebvnmKys!^&wp9@)y zhla2%Uck)`6S&0nit7pD+)T-F-x@31p@u^o>W=5QVH{gCUAN7+#?!5F z7O=$x&y-YnC-NNz&r*9axda~tj#Kaso$$@Mmc{SjU0ii&wQjlE?WyX4zKv?n|DoD@ z2dZruVnJ-3YM>z)dhA&n44(SVZ zF1_f6z9&22M>pvN9&zUdwUGeE#@^l~_wz1roq=YJz*x*KDXQXk`nkaH<9=6Qkms3W zb7~_AOoHqVl5$=6V84gU8{ckx8MGM6-6d9vQ`B zeuFjAL1M8lfCN|#C&KR!l8XaNFm^mOpv*Dg9cX0ase#`QGE=bpjckxzOb`v8Aw36a ze$O~t*H<>k1X&pwbbxY}D4$`|ula+ELqW%q+US($rjvIDDZZ;jQI|YDfEnWhjojCi zrE6W5E-B02Zzv1mD#!vFfsC~v%eOblqCu7@${|@aDT_8eej8c3w#cGQk8t)iW%);A zyDaxYmTtLEk!<&&-y`>_n>diIo;&Q=!CQ80yOiA|WzfU#hpeywt^0hdtoro9uOaIV zoR7pfgL}*$k|Eq0g$OGjoxc9z74qxnzWD01)6IAO^wnq2M-Kk|bIlhnM;>1~jJk~3 zWCXDM@x?WA-%(WHSC;|%)~AIC7?Vff!oG)jgcEM9Y!&S_GRp+jrU=--tek&Skk?(M zSYwrZrSxHWwssfV&6nO?IsNg<$*Y@LEL{KM?887s@L8{j`v~QJDxN@*SH(}LVr#om zMd-^{K3INZemjLniGp51NFd0zOFs!T@{}#Kh*PY6?lLX8j}T(TYTfmvZdP8sw0!36 z=CfCp=dW%-jE0Em4McH()_9tVr%Ak(7ReC<@UV1(CvVLq=*St7oEK{%r*LY;hU zsBvp%BlHN>f|)H$Agn~_2A>P&(mfi*lws*teOlvLVdDIQ3e7B+XX)r>qHoT#wyos zPdp%LB6uiTe!f@yfVwGPSdJ?L2Qv}1A4axmS8DC)DjQQ0NoJNY5Uk4O9g%73vgTmG8R|;~m3{fkz$HKU8A*}ID zRzikED3R6;*@TX_2Qh;(Fp>=U4)TRUE2!}vhLGOdC=}Mub|Ql5S998+Hb}7yqL`f8 zefxG4?GOYapl-FB8Cp(Fs|N79S+|QIdYK7q=D2Q$v&{~9D!6jxtQV^I1h}%h0D3g ze{4{3aVA0ADCr8mj+TX>yORRwaDbNO#2Cycxl+>S6 zf*3tKVBJ0$SfO`pxDod|7jtl=EI`mVKq)<1qssJk1R4D@$VdQgFW|c7qOhVnf?e$K zdzTQ%Met|T>0A&C5`5r1LwLc0q%`J(hi1DrE04E7Q)5G~Czi zcPKSVMn&f1elkkYds^DGd|My92GmBUW2W(nAu_CUT*nl$lUB0=6#K$VK1d=beOrChO^ za9pHO-h@@N1=)hgDnClCUr|9hi@bE=2wEX?$)HwCf}E+D8F%~w&W7vZd7uKL>iILE zLu$^a#19EUuh8=wnNl%EDBWWgINV|r@vOJmL<&xD*_JGAyUEI9VY(&Q?VcmTp4)81 zpFznZHWZaoH3O!kpHeb46(wo$986wHO=|`W%piT50jRjTQ)4RoZ#*V`3tXHJI*|EJ zu@6-kL%Ls^5p`hP8_!OeJ6coY!sNZohJp47eB*B+}0yY=|v zxGin&jPlaxNob@kYYS@AMi5V%&Zc}_qD&Gm4N!KyZ11O>(xY7T-w6N! literal 0 HcmV?d00001 diff --git a/codes/news_recsys/news_rec_server/recprocess/rank/examples/dataset/data_process/create_ctr_data.py b/codes/news_recsys/news_rec_server/recprocess/rank/examples/dataset/data_process/create_ctr_data.py new file mode 100644 index 00000000..a23af03b --- /dev/null +++ b/codes/news_recsys/news_rec_server/recprocess/rank/examples/dataset/data_process/create_ctr_data.py @@ -0,0 +1,53 @@ +#!/usr/bin/env python +# -*- coding:utf-8 -*- +# @File : create_ctr_data.py +# @Author: xLyons +# @IDE : PyCharm +# @Time : 2022/2/7 + +import pickle +import numpy as np +import pandas as pd +from sklearn.utils import shuffle + +from model_tools.feature_columns import SparseFeat, DenseFeat + + +def create_ctr_data(data_path, args, use_dict=True): + with open(data_path + 'data.pkl', 'rb') as f: + all_data, feature_info = pickle.load(f) + f.close() + + # 训练数据和测试数据 + all_data = shuffle(all_data) + train_df = all_data[all_data['是否点击'] != -1] + test_df = all_data[all_data['是否点击'] == -1] + # 测试数据的标签 + test_labels = pd.read_pickle(data_path + 'test_label.pkl') + test_labels = pd.merge(test_df[['index']], test_labels, how='left', on=['index']) + + all_features = feature_info['dense_features'] + feature_info['sparse_features'] + if use_dict: + train_inputs = {name: np.array(train_df[name].tolist()) for name in all_features} + train_labels = train_df['是否点击'].values + test_inputs = {name: np.array(test_df[name].tolist()) for name in all_features} + test_labels = test_labels['是否点击'].values + else: + train_inputs = [np.array(train_df[name]) for name in all_features] + train_labels = train_df['是否点击'].values + test_inputs = [np.array(test_df[name]) for name in all_features] + test_labels = test_labels['是否点击'].values + + features_columns = [DenseFeat(name=feat, + dimension=1, + dtype='float32',) + for feat in feature_info['dense_features']] + + features_columns += [SparseFeat(name=feat, + embed_name=feat, + embed_dim=args.embed_dim, + vocab_size=all_data[feat].max()+1, + dtype='int32',) + for feat in feature_info['sparse_features']] + + return (train_inputs, train_labels), (test_inputs, test_labels), features_columns diff --git a/codes/news_recsys/news_rec_server/recprocess/rank/examples/dataset/data_process/news_data_process.py b/codes/news_recsys/news_rec_server/recprocess/rank/examples/dataset/data_process/news_data_process.py new file mode 100644 index 00000000..6dc8ef8c --- /dev/null +++ b/codes/news_recsys/news_rec_server/recprocess/rank/examples/dataset/data_process/news_data_process.py @@ -0,0 +1,163 @@ +#!/usrbin/env python +# -*- coding:utf-8 -*- +# @File : news_data_process.py +# @Author: xLyons +# @IDE : PyCharm +# @Time : 2022/2/7 + +import os +import gc +import swifter +import pickle +import numpy as np +import pandas as pd + +from tqdm.auto import tqdm +from sklearn.preprocessing import LabelEncoder, StandardScaler +from utils.data_compression import reduce_mem + + +def get_statistical_features(all_data, past_day=7): + # 统计新闻从发文到展示的日期差 + temp = all_data['展现日期'] - all_data['发文日期'] + all_data['从发文到展现的日期差'] = temp.dt.days + all_data.loc[all_data['从发文到展现的日期差'] < 0, '从发文到展现的日期差'] = 0 + all_data.fillna(value={'从发文到展现的日期差': 0}, inplace=True) + + statis_dense_columns = ['从发文到展现的日期差'] + + dates = all_data['展现日期'].unique() + dates.sort() + date_num = len(dates) + date_map = dict(zip(dates, range(date_num))) + all_data['展现日期_idx'] = all_data['展现日期'].map(date_map) + + train_data = all_data[all_data['是否点击'] != -1] + + # =================================================================================== + for feat in tqdm([['user_id'], ['item_id'], ['一级分类'], ['二级分类'], + ['user_id', '一级分类'], ['user_id', '二级分类']]): + res_arr = [] + name = f'过去{past_day}天_特征({"_".join(feat)})_展现总数' + statis_dense_columns.append(name) + + for day in range(0, date_num): + train_data_temp = train_data[ + (train_data['展现日期_idx'] >= day-past_day) & (train_data['展现日期_idx'] < day)] + train_data_temp = train_data_temp.groupby(feat)['item_id'].agg([ + (name, 'count')]).reset_index() + train_data_temp['展现日期_idx'] = day + res_arr.append(train_data_temp) + stat_all_data = pd.concat(res_arr) + all_data = all_data.merge(stat_all_data, how='left', on=feat + ['展现日期_idx']) + + target = '是否点击' + for feat in tqdm([['user_id'], ['item_id'], ['一级分类'], ['二级分类'], + ['user_id', '一级分类'], ['user_id', '二级分类']]): + res_arr = [] + name_mean = f'过去{past_day}天_特征({"_".join(feat)})_点击率mean' + name_sum = f'过去{past_day}天_特征({"_".join(feat)})_点击总数sum' + + statis_dense_columns.append(name_mean) + statis_dense_columns.append(name_sum) + + for day in range(0, date_num): + train_data_temp = train_data[ + (train_data['展现日期_idx'] >= day-past_day) & (train_data['展现日期_idx'] < day)] + train_data_temp = train_data_temp.groupby(feat)[target].agg( + [(name_mean, 'mean'), (name_sum, 'sum')]).reset_index() + train_data_temp['展现日期_idx'] = day + res_arr.append(train_data_temp) + stat_all_data = pd.concat(res_arr) + all_data = all_data.merge(stat_all_data, how='left', on=feat + ['展现日期_idx']) + + target = '消费时长(秒)' + for feat in tqdm([['user_id'], ['item_id'], ['一级分类'], ['二级分类'], + ['user_id', '一级分类'], ['user_id', '二级分类']]): + res_arr = [] + name_mean = f'过去{past_day}天_特征({"_".join(feat)})_消费时长mean' + name_std = f'过去{past_day}天_特征({"_".join(feat)})_消费时长std' + name_sum = f'过去{past_day}天_特征({"_".join(feat)})_消费时长sum' + statis_dense_columns.append(name_mean) + statis_dense_columns.append(name_std) + statis_dense_columns.append(name_sum) + + for day in range(0, date_num): + train_data_temp = train_data[ + (train_data['展现日期_idx'] >= day-past_day) & (train_data['展现日期_idx'] < day)] + train_data_temp = train_data_temp.groupby(feat)[target].agg( + [(name_mean, 'mean'), (name_std, 'std'), (name_sum, 'sum')] + ).reset_index() + train_data_temp['展现日期_idx'] = day + res_arr.append(train_data_temp) + stat_all_data = pd.concat(res_arr) + all_data = all_data.merge(stat_all_data, how='left', on=feat + ['展现日期_idx']) + + return all_data, statis_dense_columns + + +def main(): + raw_data_path = '../raw_data' + new_data_path = '../new_data' + os.makedirs(new_data_path, exist_ok=True) + + train_data_path = os.path.join(raw_data_path, 'train_data.pkl') + test_data_path = os.path.join(raw_data_path, 'test_data.pkl') + + train_data = pd.read_pickle(train_data_path) + test_data = pd.read_pickle(test_data_path) + test_data['是否点击'] = -1 + all_data = pd.concat([train_data, test_data]) + + # 1. 合并用户特征 + user_path = os.path.join(new_data_path, 'user_info_5w.pkl') + user_info = pd.read_pickle(user_path) + all_data = all_data.merge( + user_info[['user_id', '设备名称', '操作系统', '所在省', '所在市', '年龄', '性别']], + how='left', on='user_id' + ) + del user_info + gc.collect() + + # 2. 合并文档特征 + doc_path = os.path.join(new_data_path, 'doc_info.pkl') + doc_info = pd.read_pickle(doc_path) + all_data = all_data.merge( + doc_info[['item_id', '一级分类', '二级分类', '关键词', '图片数量', '发文时间', '发文日期']], + how='left', on='item_id' + ) + del doc_info + gc.collect() + + # 3. 获取统计特征 + all_data, statis_dense_columns = get_statistical_features(all_data) + + # 4. 连续特征处理 + base_dense_columns = ['刷新次数', '图片数量'] + dense_columns = base_dense_columns + statis_dense_columns + + all_data.fillna(value={feat: 0 for feat in dense_columns}, inplace=True) + # sc = StandardScaler() + # all_data[dense_columns] = sc.fit_transform(all_data[dense_columns]) + for feat in dense_columns: + all_data[feat] = np.log(1 + all_data[feat]) + + # 5. 离散特征处理 + sparse_columns = ['user_id', 'item_id', '网路环境', '设备名称', '操作系统', '展现位置', + '所在省', '所在市', '年龄', '性别', '一级分类', '二级分类', '关键词'] + for feat in sparse_columns: + lb = LabelEncoder() + all_data[feat] = lb.fit_transform(all_data[feat].astype(str)) + + all_data = reduce_mem(all_data) + feature_info = {'dense_features': dense_columns, + 'sparse_features': sparse_columns} + file = [all_data, feature_info] + file_save_path = os.path.join(new_data_path, 'data.pkl') + with open(file_save_path, 'wb') as f: + pickle.dump(file, f) + f.close() + + +if __name__ == '__main__': + main() diff --git a/codes/news_recsys/news_rec_server/recprocess/rank/examples/dataset/data_process/train&test_data_split.py b/codes/news_recsys/news_rec_server/recprocess/rank/examples/dataset/data_process/train&test_data_split.py new file mode 100644 index 00000000..515eba88 --- /dev/null +++ b/codes/news_recsys/news_rec_server/recprocess/rank/examples/dataset/data_process/train&test_data_split.py @@ -0,0 +1,46 @@ +#!/usr/bin/env python +# -*- coding:utf-8 -*- +# @File : train&test_data_split.py +# @Author: xLyons +# @IDE : PyCharm +# @Time : 2022/2/16 + + +import os +import pandas as pd + + +def main(): + raw_data_path = '../raw_data' + new_data_path = '../new_data' + + # 1. 数据读取 + all_data_path = os.path.join(raw_data_path, 'train_data_5w.csv') + all_data = pd.read_csv(all_data_path, sep='\t', index_col=0) # .sample(n=100000) + all_data.columns = ['user_id', 'item_id', '展现时间', '网路环境', '刷新次数', '展现位置', '是否点击', '消费时长(秒)'] + print(f'样本总数为:{all_data.shape[0]}') + + # 2. 数据处理 + all_data.loc[all_data['消费时长(秒)'] < 0, '消费时长(秒)'] = 0 + all_data['展现时间'] = pd.to_datetime( + all_data.loc[:, '展现时间'], utc=True, unit='ms').dt.tz_convert('Asia/Shanghai') + all_data['展现日期'] = all_data['展现时间'].dt.date + all_data['index'] = range(all_data.shape[0]) + + dates = all_data['展现日期'].unique() + dates.sort() + # 3. 训练、测试数据集划分 + train_data = all_data[all_data['展现日期'] != dates[-1]] + test_data = all_data[all_data['展现日期'] == dates[-1]] + test_label = test_data[['index', '是否点击']] + + # 4. 测试集处理 + test_data = test_data.drop(columns=['消费时长(秒)', '展现位置', '是否点击']) + + train_data.to_pickle(os.path.join(raw_data_path, 'train_data.pkl')) + test_data.to_pickle(os.path.join(raw_data_path, 'test_data.pkl')) + test_label.to_pickle(os.path.join(new_data_path, 'test_label.pkl')) + + +if __name__ == '__main__': + main() diff --git a/codes/news_recsys/news_rec_server/recprocess/rank/examples/dataset/data_process/user&doc_data_process.py b/codes/news_recsys/news_rec_server/recprocess/rank/examples/dataset/data_process/user&doc_data_process.py new file mode 100644 index 00000000..deb8966b --- /dev/null +++ b/codes/news_recsys/news_rec_server/recprocess/rank/examples/dataset/data_process/user&doc_data_process.py @@ -0,0 +1,103 @@ +#!/usr/bin/env python +# -*- coding:utf-8 -*- +# @File : user&doc_data_process.py +# @Author: xLyons +# @IDE : PyCharm +# @Time : 2022/2/9 + +import os +import swifter +import pandas as pd +import numpy as np + +from tqdm.auto import tqdm + + +def prob2val(feat_info): + # 判断是否为空 + if feat_info == feat_info: + prob_list = [values.split(':') for values in feat_info.split(',')] + prob_list = sorted(prob_list, key=lambda x: float(x[1])) + return prob_list[-1][0] + else: + return np.NaN + + +def get_second_title(x): + if x['二级分类'] == x['二级分类']: + second_titles = x['二级分类'].split('/') + for title in second_titles: + # 跳过异常数据 + if title == 'A_0_24:0.447656,A_25_29:0.243809,A_30_39:0.076268,A_40+:0.232267': + continue + # 优先返回不等于一级分类的二级分类 + if title != x['一级分类']: + return title + + return x['一级分类'] + + +def get_key_word(feat_info): + if feat_info == feat_info and isinstance(feat_info, str): + key_word_list = [values.split(':') for values in feat_info.replace('^', '').split(',')] + + new_list = [] + last_elem = '' + for idx, values in enumerate(key_word_list): + if len(values) == 1: + last_elem = values[0] if last_elem == '' else ','.join([last_elem, values[0]]) + continue + if len(values) > 2: + # 将类似于‘你好,李焕英’这种关键词重新进行拼接 + # 这类关键词由于存在逗号,在获取key_word_list时被误分开了 + values[0] = ':'.join(values[:-1]) + + values[0] = values[0] if last_elem == '' else ','.join([last_elem, values[0]]) + new_list.append(values) + last_elem = '' + + return new_list[-1][0] + else: + return np.NaN + + +def main(): + raw_data_path = '../raw_data' + new_data_path = '../new_data' + os.makedirs(new_data_path, exist_ok=True) + + # 1. 处理用户文件 + user_path = os.path.join(raw_data_path, 'user_info_5w.csv') + user_info = pd.read_csv(user_path, sep='\t', index_col=0) + user_info.columns = ['user_id', '设备名称', '操作系统', '所在省', '所在市', '年龄', '性别'] + + user_info['年龄'] = [prob2val(age_info) for age_info in tqdm(user_info['年龄'])] + user_info['性别'] = [prob2val(sex_info) for sex_info in tqdm(user_info['性别'])] + + user_info.to_pickle(os.path.join(new_data_path, 'user_info_5w.pkl')) + + # 2. 处理文档文件 + doc_path = os.path.join(raw_data_path, 'doc_info.txt') + doc_info = pd.read_table(doc_path, sep='\t', low_memory=False, header=None) + doc_info.columns = ['item_id', '标题', '发文时间', '图片数量', '一级分类', '二级分类', '关键词'] + + # 处理异常的发文时间数据 + condition_row = (doc_info['发文时间'].isnull()) | (doc_info['发文时间'] == 'Android') + time_fill_value = doc_info.loc[~condition_row, '发文时间'].swifter.apply(lambda x: int(x[:10])).astype('int').min() + doc_info.loc[condition_row, '发文时间'] = str(time_fill_value) + + doc_info['发文时间'] = pd.to_datetime( + doc_info.loc[:, '发文时间'], utc=True, unit='ms').dt.tz_convert('Asia/Shanghai') + doc_info['发文日期'] = doc_info['发文时间'].dt.date + + doc_info['图片数量'] = doc_info.loc[:, '图片数量'].swifter.apply( + lambda x: 0 if (x in ['上海', '云南', '山东'] or x != x) else int(x)) + + doc_info['二级分类'] = doc_info.loc[:, ['一级分类', '二级分类']].swifter.apply(get_second_title, axis=1) + doc_info['关键词'] = [get_key_word(words) for words in tqdm(doc_info['关键词'])] + + doc_info.to_pickle(os.path.join(new_data_path, 'doc_info.pkl')) + + +if __name__ == '__main__': + main() diff --git a/codes/news_recsys/news_rec_server/recprocess/rank/examples/dataset/raw_data/将原始文件存放在该目录下.txt b/codes/news_recsys/news_rec_server/recprocess/rank/examples/dataset/raw_data/将原始文件存放在该目录下.txt new file mode 100644 index 00000000..1bee8d38 --- /dev/null +++ b/codes/news_recsys/news_rec_server/recprocess/rank/examples/dataset/raw_data/将原始文件存放在该目录下.txt @@ -0,0 +1,13 @@ +原始数据集共包含3个,实验时存放在目录`rank/examples/dataset/raw_data/`下。 + ++ **user_info_5w.csv** + + 该文件共包含了5万条用户的个人数据; + + 特征分别包括了:['user_id', 'device', 'os', 'province', 'city', 'age', 'gender']; + + 各特征的含义为:['用户id', '设备名称', '操作系统', '所在省', '所在市', '年龄', '性别']; ++ **doc_info.txt** + + 该文件包含了所有新闻的特征数据; + + 各特征的含义为:['文档id', '标题', '发文时间', '图片数量', '一级分类', '二级分类', '关键词']; + ++ **train_data_5w.csv** + + 该文件为用户点击数据,包含了5万个用户在过去13天的点击数据; + + 各特征的含义为:['用户id', '文档id', '展现时间', '网路环境', '刷新次数', '展现位置', '是否点击', '消费时长(秒)']; \ No newline at end of file diff --git a/codes/news_recsys/news_rec_server/recprocess/rank/examples/deepfm_news.py b/codes/news_recsys/news_rec_server/recprocess/rank/examples/deepfm_news.py new file mode 100644 index 00000000..3beb3f60 --- /dev/null +++ b/codes/news_recsys/news_rec_server/recprocess/rank/examples/deepfm_news.py @@ -0,0 +1,29 @@ +#!/usr/bin/env python +# -*- coding:utf-8 -*- +# @File : deepfm_news.py +# @Author: xLyons +# @IDE : PyCharm +# @Time : 2022/1/27 + +import argparse + +from run_train import run_deepfm +from utils.set_parament import get_args +from dataset.data_process.create_ctr_data import create_ctr_data + + +parser = argparse.ArgumentParser(description='Model Parameter') +parser.add_argument('--yaml_path', + default='./set_para/deepfm_news.yaml', + required=False) +parser.add_argument('--data_path', + default='./dataset/new_data/', + required=False) +parse_args = parser.parse_args() + + +if __name__ == '__main__': + args = get_args(parse_args.yaml_path) + train_data, test_data, feature_info = create_ctr_data(parse_args.data_path, args) + + run_deepfm.run(train_data, test_data, feature_info, args) diff --git a/codes/news_recsys/news_rec_server/recprocess/rank/examples/deepfm_ppnet_news.py b/codes/news_recsys/news_rec_server/recprocess/rank/examples/deepfm_ppnet_news.py new file mode 100644 index 00000000..2cb709d7 --- /dev/null +++ b/codes/news_recsys/news_rec_server/recprocess/rank/examples/deepfm_ppnet_news.py @@ -0,0 +1,29 @@ +#!/usr/bin/env python +# -*- coding:utf-8 -*- +# @File : deepfm_ppnet_news.py +# @Author: xLyons +# @IDE : PyCharm +# @Time : 2022/2/8 + +import argparse + +from run_train import run_deepfm_ppnet +from utils.set_parament import get_args +from dataset.data_process.create_ctr_data import create_ctr_data + + +parser = argparse.ArgumentParser(description='Model Parameter') +parser.add_argument('--yaml_path', + default='./set_para/deepfm_ppnet_news.yaml', + required=False) +parser.add_argument('--data_path', + default='./dataset/new_data/', + required=False) +parse_args = parser.parse_args() + + +if __name__ == '__main__': + args = get_args(parse_args.yaml_path) + train_data, test_data, feature_info = create_ctr_data(parse_args.data_path, args) + + run_deepfm_ppnet.run(train_data, test_data, feature_info, args) diff --git a/codes/news_recsys/news_rec_server/recprocess/rank/examples/set_para/deepfm_news.yaml b/codes/news_recsys/news_rec_server/recprocess/rank/examples/set_para/deepfm_news.yaml new file mode 100644 index 00000000..f3d8cdb8 --- /dev/null +++ b/codes/news_recsys/news_rec_server/recprocess/rank/examples/set_para/deepfm_news.yaml @@ -0,0 +1,14 @@ +# data para +seed: 48 +# model para +embed_dim: 32 +drop_rate: 0.5 +use_bn: Ture +hidden_units: [64, 128, 64] +# compile para +learning_rate: 0.001 +epochs: 1 +batch_size: 2048 +val_splite: 0.1 +patience: 5 +restore_best_weights: True diff --git a/codes/news_recsys/news_rec_server/recprocess/rank/examples/set_para/deepfm_ppnet_news.yaml b/codes/news_recsys/news_rec_server/recprocess/rank/examples/set_para/deepfm_ppnet_news.yaml new file mode 100644 index 00000000..f5be149c --- /dev/null +++ b/codes/news_recsys/news_rec_server/recprocess/rank/examples/set_para/deepfm_ppnet_news.yaml @@ -0,0 +1,16 @@ +# data para +seed: 48 +# model para +embed_dim: 32 +drop_rate: 0.5 +ppnet_size: 256 +ppnet_features: ['user_id', '一级分类', '年龄'] +use_bn: Ture +hidden_units: [64, 128, 64] +# compile para +learning_rate: 0.001 +epochs: 1 +batch_size: 2048 +val_splite: 0.1 +patience: 5 +restore_best_weights: True \ No newline at end of file diff --git a/codes/news_recsys/news_rec_server/recprocess/rank/layers/__init__.py b/codes/news_recsys/news_rec_server/recprocess/rank/layers/__init__.py new file mode 100644 index 00000000..0cb6e9d2 --- /dev/null +++ b/codes/news_recsys/news_rec_server/recprocess/rank/layers/__init__.py @@ -0,0 +1,6 @@ +#!/usr/bin/env python +# -*- coding:utf-8 -*- +# @File : __init__.py.py +# @Author: xLyons +# @IDE : PyCharm +# @Time : 2022/1/27 diff --git a/codes/news_recsys/news_rec_server/recprocess/rank/layers/__pycache__/__init__.cpython-39.pyc b/codes/news_recsys/news_rec_server/recprocess/rank/layers/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d541026cff4fc17bbc6c5bc9e660c1c6f42e89e1 GIT binary patch literal 120 zcmYe~<>g`k0=o{eq;??v7{oyaj6jY95Erumi4=xl22Do4l?+87VFd9@*2OBuFSWcl qC^b1IC$TcMs5mA*J~J<~BtBlRpz;=nO>TZlX-=vgNZV&1W&i+E>lh>e literal 0 HcmV?d00001 diff --git a/codes/news_recsys/news_rec_server/recprocess/rank/layers/__pycache__/activation.cpython-39.pyc b/codes/news_recsys/news_rec_server/recprocess/rank/layers/__pycache__/activation.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2d1fd991fc5e94cfb4064b945c56f8c78a9045b1 GIT binary patch literal 643 zcmZWm&1=*^6rVTw*lgI=J=If}Ts#CM^eiHksvu|&3W8xlW;3&xIy)mXlS)fOaK*pF z9{VrNwI}}tFZw1crr?A3%lr7fuMFGU%L&Tr`@!l9CFCYJw*_P57}XvjaKb^78t_Xc z-9&W+Ar{Galhg?ik=7F)^W+<;GY+pwnf}Boa$15G=N7|Ds_zY^{CYa4IbJUY9!N|-_IrQX!Y~d^PVJo^nL-;h9{*z(!=lBBWY$(??(^2R z$~vYR;UnbxNvGOIbK$95`{aW#%=%P@>#WbmKB3Tdt0wd@5nku>H%xZosWIA=)aT=Z z^%U)QLyK!-_L2K*@#MksS*Ko!X1Qy<9O>1^3qK#-UEv+9vU3Ci`!I(bGD;!+9a9s6 z<<#d+D67p{>Q82ZA%8dqs^N;NRjq6B=Uw6{CcKWn`yXew&2W%OkYdsRVaBldEeIR= V?BPc9PKrkvdNx1?Gnmmh`U9iXtS$fm literal 0 HcmV?d00001 diff --git a/codes/news_recsys/news_rec_server/recprocess/rank/layers/__pycache__/core.cpython-39.pyc b/codes/news_recsys/news_rec_server/recprocess/rank/layers/__pycache__/core.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e863ff4723c67c2787a74ac9d12bc560545fef37 GIT binary patch literal 7751 zcmc&(TZ|i58J=_I;+gT-yV<5mQ+iq2P-+27XiIOT>4k05(qdY&P-;@9UeD~t?%3X( znI-JXTZwL@gomw!)F&QLicrx?Jo3OBJXC!FB;Jtb34sL48xn6xtMGsSjMw(Ay(~h3 zvF4w1zx?OFeCKz)RB|wym1gF4PJFZS=@)J2j{6*4%cnR^&1x^4g_ZiPL6OZco*wgmy*?te|j73kspW zsn@0{ZwC(L9nK%3yc-lLU*!A@<-MRp`4Z=E3&dHiQocqj)2bCSHD9i;g(@}Y!%iHg z&hzcXFbJBR<#gt`deS&w?W%S?YOdFlX19|T=2ds4yP8z=w6q*`RX6cF+#oe;q3XtI zIWJx9G?%ENB2pPWM0C2|NSfd0LHr04HZcG2W96Cn5lV#2ubK=>m$fCq`*mtMr?FSN zW(4J+7^ilghkO+-r(Te;&{y>&Os&;8^cOqnA!6#vYT^%dFyb%_QZrs%36=6_ zm8tFfO&Zttx1E$~qa}5Siq#CrZ6Jq1ZU;F6a+FAFy>+p!mg8-aI=Tyi{wd=>` zpFCF$FUH>t8|U~y;&Y9z3g=eV)G_LYf#Yc+nzY2wPp*5!vHh_#zyStR0$Xi@5@_9N}Gc3CXRKnD6HJJAy(tJxIu+9tK~O{x`vF`?F? z)&;d5m{hNg>1}gwYeH{_YYP)wVl6E<5GS>yl$5WCO?h>yC*IGGykb|c z-K3`>XtVtX9D#u0K;qLxmbISt#;FaAM1qoto&X1~5O~I)CycIPwr#p%!t5rFk95)NR2KA>{3q?*|Qih zf)@k(!;2|nF|-5{`e8Om1>>8UuBVV1GtEON$267cRBv{Y)PT_Nv8AS}cb3Cck3w=v znO?-|%jf{}mpY4`%*~KT>f76l*Xf~wiOrj2#*){A^UG|l$pX8rvrs^-V4X7^bMp+; z>TY!4h0Y@N5)l-|_Yu*AEe{LN@~+zjSNA@#4NKf9UE$rd^?S&yPRn&?M>Q52Fq{1c zQi%m&y!b;RJ+WaVFdiBVD9q#=zuYhry=7blwcxK>g`RmV(+2WkEgQM_- zD7;k6qa*gL4dcR0k4;#WgdDa)WhN{&>OTnKnQXRA1UjytY}elkXX8~B z&L-!>*}+-OF0FP5LUcQ`OXtyIY>IG!Kh91ts+DiDmQeDDDD~d!T|gqP~OzxztESxJ!{!gKS^)1l8x_VaOS>DBev&*f$N`5;H=IBLY_DIMrBk z-G+FeQ^isx%vSD`(^WM^>55p`E&vz62l#&K`hL3`tVT$CzJFo09%VJ`)LA^$o768Y zdsR^qA5I_u%U<*xXH_M)P7nkorZ%}+ZF?3-Ey}69ojKI9qM{hu` zKyQp#hp<~QwO(#^!n)!up61Op`h#L%m`LQI6)~*xUd#qx&@vd883SCVu!oAtE-keg zZjmTtzK+!$aEI#I_``OoWe_%iINVId2L=$$zd>&d9n~^yB-f|1)sF{8fCL4485q?l zL0Uq3O7KXMzUt)qbG=TWw+1Kahz1u*B5u=9-mnW6=Ls$d`LSJC_i{$E`5l3mIDJR~ z+eGXH4i-|-l8gZK4RB9a_W{ATjK?&}>vS#9tJxl3r7m>N?DD=t>7h-*22_k}1LshM zgWus})LkYpze|}}jm}fETn@k4qi!~WTv`Try2uO7itAKO{) z<4BFJ_X^f~ATWOr46B~Q3{QcKnQIH>PZOz>)K`(72RQ^Xz$j;sWdH)$u^Wf7^*v8*n4tJBY13p z#R!QNb3Tp-W2#Br`+#h+rN@D;&^wB#jw+Pj5^Hb1@=7&KhKzlN8XGVd{Rij{X0S+a z3^CjbtR@A+dK3!tDIk_O91?RGdO5qHS21;0RyrZwo^WB}gFArPZ_!HF>{r~(FTaI? z1N5GR*?xijVd7m8E#T}j^XUL?!k~*w<|Y8$_Kj>&>}IUFr+6i~14*%~@%2_7yBc4c zIg6hL1?oL6@|DH5>`5v_!U+1{aF*9I4n6jWqYP~!?q^GsER{sGKwHv8mmYzCXD|=q$keF z4TCVlXr$^RjzQ@?i=s~y*4`+PLn$043vgJYsFu+(uUbS=il`N}!rua0xWzfhQ4EV5 zP0Lk^Xpu*;DTZZG49jNR;fj^(V_4^XT1D{oEfu=5y}^R~k=Ec)g# ztf|ovdp5EtBqod!s_$V)A7qzey-8(%qBn->05{H1#L3$~4&`PdFdqEu zM4~1GABg1#m=h9{hmuO({R-uk!>OBv)PFWdJO#$5~~7Q!#MIE9b;M#{ZkV2Um&$Z*_3mG5M&mzhV|q7dFF$#g&`I*vuCL+ dwNGY|xKo&xucx-SOHe#mdiwOK>HDT1{TB#b%Ypy^ literal 0 HcmV?d00001 diff --git a/codes/news_recsys/news_rec_server/recprocess/rank/layers/__pycache__/embedding.cpython-39.pyc b/codes/news_recsys/news_rec_server/recprocess/rank/layers/__pycache__/embedding.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5ac32f42ea2f1e34fa4b0942350f43cdfe151aa5 GIT binary patch literal 1049 zcmZWoPjAyO6t^AcU)RAlM$sh35vi9Qc0y=ElNdYpupti7P?W~r2z8Urc8ZGJ(?Iz^9;4SNp{&C~7G2}SXYncOuiZfm>SG(f8xJ{SMR@@iBKC9Q42u)M zHv)PKqz1qWG$#ffp^t}i&v>8FoEmD#4ZcNFEF(=o=P?YN0{J>H_zE-f-Jko+GkzKx z-w;C&&}C!@bU_Dq7_#<#8yDrI)+sB-w%dw=mlyhz?Nqg1T6%oht4fQq&_yPSuUy(y zz2q0Q$fVQ1??<-NgrsFQ=9Z55OiA~Dwll5rY?vx=vvKR0lu&1fe8TLPy?EPAs!L+(uBsFA{iNM!W}e^ z3<4jrgG_XV0oky@mC6$hSl`fFbcIim@d0NtkPDn?gM4bUWyE^97l%gHMn>MC`S$=1qH+f7mOiHnFEgR7Qw9?~^4XxUR&Y2$vn zsi>+^J$V6I<(gQIa~p=bO(^aXxeMAspR{v?eR!sMsVaFcsxNyZE;F^)Le{R>?tj*m l)l|)TwXnFfD}A{&kWF*Z!MI|z;BVX$RPHTyao3Bs{sEHL7fb*E literal 0 HcmV?d00001 diff --git a/codes/news_recsys/news_rec_server/recprocess/rank/layers/__pycache__/interaction.cpython-39.pyc b/codes/news_recsys/news_rec_server/recprocess/rank/layers/__pycache__/interaction.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fc6b27e36c9a106a837f4580fd500d5574ab025e GIT binary patch literal 1156 zcmaJ=OK;Oa5Z?9LiAfV4QYi%?;R_OxP=v${AwU9(#3Kiul2vKtdN;+5V~5?f+D7h$ zRP+SDp_RDwFZqPTDL;S<7noTm5H5@~yWZKI*EgS|PNzjcwmz+G{a}QA$HTI?aCiW! zZ@?mmpeYF{>`v;8SjZ@OLIe};ArY=*Qx>{{z9L<32CqrKODperv@cbR$7O+3R`e~{ zqzxH5M`S<+70#g_GW(5zeF*7^#vuv4BjS><0pXifbK}KBm1`Z-c_7>zP^@yR7nbF_ubUF_q7Gk|l=ou2Z>Mrh}?6+>O+>KBd(e z&f_%Fn)6fg^V*|3n;S*;QpTGp_NF(J%t#f*Cdsqjcwc!C?ktvo^$B*uM)}NvC4+!NTU7>$@3cL9qlE$ z@`qB60WrM|W{_|9XE>YBTMmR7)zJLGr*h*cP1hmB9H2G2>iD3O_5vV$0OSX%X3g`k0=o{eq;??v7{oyaj6jY95Erumi4=xl22Do4l?+87VFd9@$;B$hFSWcl uC^b1IH$NpcC%z;>FMbijULuM65E24*dD|Y93S#<6gh~Mbqp92#L}s;Mx)iF#-r`W zh~#WmO3x)Dz`p1|?B24MAea0Z-U0_O_88<61V}bj>>0hBLnyF@MHSiPS6@{z?RG-~ z?WbRE9{#&a$XBRbZ!RdmfUf=wf)h@25;wmzW}vg&DcsnlIOpVE;m3Z_id#hx2X@ZQ z+eH|MMJMi9-OHn*8+Yy4&wIs6ykf_#ykD%wtHoNpM#)En2fTewcw4N`op=NEkas}u zSp5|q@$NZ^H+he*!0#P?m#^}*a~j{}_xL8i1EYJK?vug&7jSAa98lBwjpRZKzBkVF zz%ky(p`AsElSh*uqv|@M$iY$M4zT5uZm$Mi!?W0EM%o%lW1SdaXI>=svb=z zp!=VUOOZ-gu$oB$OA}EX3Z8KIi0Rug2m2(q3o<;*^@iADQ?G##B&M7IrrsCih;oO! z=k3_}g197h;U>Nb_a~`T;sJbT_~Knyk+pE~qT$$s?m$<+22s?gG+eW|N_|r#!Jv;vZ5OfQX zmd00_bMe@?N(ep(ja#OLFyZ4WOAiw@J`<+Bgrtx@Y+}MjAD3foJg!eCVBSpGrJkBj zgp?7Q21$ZuNit~R2l6Ai2Geo_#X5>j6n9Xpf_P3%J4v!URVqoIlYhOv_x{5N)ACc1 zJuE6N@di+n48ILkYn` zo_lykJa>|qFiGGu(;UZ9lAKJ_y!nsa-gFm7%S4_hiF_57H{x43LI&7NzXAirKm^q4 z|0ii&OB%+V8>I0}dry=L&SXov!RElKhANgW-vwns7bFU`qyf5UAP7*wTT}3b8ypLn+t~YUHJ!NldFK(}ksDB5#3P3DSXJ>1dC%%H4 z4(01$@;^btkG=tY2^#7d1+i=x70A2~p{vIrG?|kbM8_Wa>}bX`M2BW`CVzzSQL|$g zy2sgtGjjk4JF`PhLTl%P-Jy2CqIGoB7NhCPHsFqK*KAJb><{!LtT7k4kcM91!7%-( z&r_he#Rs7|d;R0`ahi|$)`Fc|4Li3mKHeEsdJsw^g$c)MTq>QGnXvSb=)kLe8W$kq zrhNcQeu5Mz)Em2puqU1hV(ax&t`pM`b}e{`VPIGm1io-B85 z_W3^iM{R*Xj|McPF8=+;cOn=E%Q5RS`9rYD@K(!)p2BWDg&EFz3NjOTYHQ|z2Oyu_ zBTv~gtTyz5%@B1~JLsR90mf_yxwy$b(>#@@CM;6*2m=3+C^2VYVwOlZ<4?=+$rMnD zgA@Xir#g*S7I$A=T zrO!g(9?TzP=*wlhfdwub5#S>t@!HJQq;@rMO9r*`6hgLkj{%owl#-L6X7YXQ9(%RT z_J9$v7?5#>FJ3}}9O!JdxDXQ@U7ew+v-qx%Sd@&5I~f0HoNFl1cmdf#VLj>M2stZ% zhGGXr8^mDc8il&clfIWoF)~q8_VY#yrjTb2$EmWlJYHGYE@RQM&1~L`&}9ubtBd*N zTLYe7H>dFImbGU6hUp^rEl;*kqvb&n(%WvZ6u)k80QXc_c8JYzA>_T#VJ}@8L1!-+ z3u&MA=muTGtoK_)g&shOME1)MKpa@=5AMr7P)rD~P*usJy!xF56C*FrMW&(Hs-_Cu zxiZ;)BxI_#^Ym0m)l`E^Uzk_FVTbRjF%+6Jw8q$y=-~LhZx}WkZi^>Srm)oOG~b>` zF_Bd!6x5(mlSVrWyR4GJIy~+#SK><;Gv5_ej9(od!$q3Pg^1Z2V6$ICBLwFf-j{C$ i6YMl^t6e0O!pB8G0~P_OeHy`U-|2Vz?O;6!!|1<8bQ^^L literal 0 HcmV?d00001 diff --git a/codes/news_recsys/news_rec_server/recprocess/rank/model_tools/feature_columns.py b/codes/news_recsys/news_rec_server/recprocess/rank/model_tools/feature_columns.py new file mode 100644 index 00000000..2ff1d919 --- /dev/null +++ b/codes/news_recsys/news_rec_server/recprocess/rank/model_tools/feature_columns.py @@ -0,0 +1,102 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# @File : feature_columns.py +# @Author: xLyons +# @IDE :PyCharm +# @Time : 2022/1/27 + + +import copy +import tensorflow as tf + +from collections import OrderedDict +from tensorflow.keras.layers import Input, Flatten, Concatenate +from tensorflow.keras.initializers import RandomNormal, Zeros +from tensorflow.keras.layers.experimental.preprocessing import StringLookup + +from layers.core import Linear +from layers.embedding import create_embed_dict, embedding_lookup + + +class SparseFeat(object): + def __init__(self, name, embed_dim, vocab_size, dtype, embed_name=None, seed=48): + self.name = name + self.vocab_size = vocab_size + self.embed_dim = embed_dim + self.embed_init = RandomNormal(mean=0.0, stddev=0.01, seed=seed) + self.dtype = dtype + + self.embed_name = embed_name if embed_name else name + + super(SparseFeat, self).__init__() + + +class DenseFeat(object): + def __init__(self, name, dimension, dtype=None): + self.name = name + self.dimension = dimension + self.dtype = dtype + + super(DenseFeat, self).__init__() + + +def build_feature_inputs(feature_columns): + feat_inputs = OrderedDict() + for feat in feature_columns: + if isinstance(feat, SparseFeat): + sparse_inputs = Input(shape=(1, ), + name=feat.name, + dtype=feat.dtype) + feat_inputs[feat.name] = sparse_inputs + elif isinstance(feat, DenseFeat): + dense_inputs = Input(shape=(feat.dimension, ), + name=feat.name, + dtype=feat.dtype) + feat_inputs[feat.name] = dense_inputs + else: + raise TypeError("Invalid feature column type,got", type(feat)) + + return feat_inputs + + +def build_feature_coding_model(all_data, sparse_features): + feature_vocab_dict = dict() + for feat in sparse_features: + string_model = StringLookup(vocabulary=all_data[feat].unique(), + mask_token=None) + feature_vocab_dict[feat] = string_model + + return feature_vocab_dict + + +def get_dense_inputs(feat_inputs, feature_columns, concat_flag=True): + dense_inputs = [] + for feat in feature_columns: + if isinstance(feat, DenseFeat): + dense_inputs.append(feat_inputs[feat.name]) + + if concat_flag: + dense_inputs = tf.concat(dense_inputs, axis=-1) + + return dense_inputs + + +def get_linear_logit(feat_inputs, feature_columns, linear_l2_reg=.0, embed_l2_reg=1e-5, use_bias=True, seed=48,): + linear_features = copy.deepcopy(feature_columns) + for feat in linear_features: + if isinstance(feat, SparseFeat): + feat.embed_dim = 1 + feat.embed_init = Zeros() + + sparse_feature_columns = list( + filter(lambda x: isinstance(x, SparseFeat), linear_features)) if feature_columns else [] + sparse_embed_dict = create_embed_dict(sparse_feature_columns, embed_l2_reg) + sparse_embed_list = embedding_lookup(sparse_embed_dict, feat_inputs, sparse_feature_columns, to_list=True) + + dense_inputs = get_dense_inputs(feat_inputs, linear_features, concat_flag=True) + sparse_embed_inputs = Flatten()(Concatenate(axis=-1)(sparse_embed_list)) + linear_inputs = tf.concat([dense_inputs, sparse_embed_inputs], axis=-1) + + linear_logit = Linear(linear_l2_reg, use_bias, seed)(linear_inputs) + + return linear_logit diff --git a/codes/news_recsys/news_rec_server/recprocess/rank/models/__init__.py b/codes/news_recsys/news_rec_server/recprocess/rank/models/__init__.py new file mode 100644 index 00000000..0cb6e9d2 --- /dev/null +++ b/codes/news_recsys/news_rec_server/recprocess/rank/models/__init__.py @@ -0,0 +1,6 @@ +#!/usr/bin/env python +# -*- coding:utf-8 -*- +# @File : __init__.py.py +# @Author: xLyons +# @IDE : PyCharm +# @Time : 2022/1/27 diff --git a/codes/news_recsys/news_rec_server/recprocess/rank/models/__pycache__/__init__.cpython-39.pyc b/codes/news_recsys/news_rec_server/recprocess/rank/models/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..167ba48dcf557f55aa5cded46a0ff0380beabd53 GIT binary patch literal 119 zcmYe~<>g`k0=o{eq;??v7{oyaj6jY95Erumi4=xl22Do4l?+87VFd9@#>Fbetu!wv oH900XKP5G%I3_+mGcU6wK3=b&@)m~;P_Q&7)efZVGY~TX05ka)qW}N^ literal 0 HcmV?d00001 diff --git a/codes/news_recsys/news_rec_server/recprocess/rank/models/__pycache__/deepfm.cpython-39.pyc b/codes/news_recsys/news_rec_server/recprocess/rank/models/__pycache__/deepfm.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..11c8aaac891d5f3be2f607c11a5d53519b9304e6 GIT binary patch literal 1726 zcmZ8iOK%%D5GJ_~y;e`l541q<%0UQvYhV}!Z0KekYM&Mc7Hg4G)}eLJzg=4g&%j4dXd%)Qui*yH{@hy#ax9$q5aq753o#c_o;FVLWU3+K@Qo8XM) zk^%6}#$$L?M(3xZkP;w%^6~fQAAb7vuis8CKU#A;M#dM6FK?d$p14@uHbO`6w9%cy zlp#h)A&u5}RaaQo-;wtgH#8DY6jmO^l)SZivc@@?p;e@O9ci*AtELKcQ`dDwJ?d}# zRZE2+A&`QNw`!}3m9(dc_H~Vhn{d@pRa&9d4c0BVqpNCbq`QE6s=h{`tL=2Kx2GC6 zk#6g~8FBk$)mKf8byWvC)D>Fa1iBA)`>F*#8(`hO>D+YTrf7p{bK~iKr)}Ky-tFm* z?j0h%e~8}W)q%6^-|Xv#KA^1)Ssm&_%k+2xWZVynJ(w3TrRC4kr0|-6DIO1qsb@0F zrAqS&i|ZGQR7f^rsj~HW^$QXa9QpLs*|V$BGXH^1uI2@0TwYPe7Ss7~@!C|6c{;zQ z>B%=>`3Fq&=I9Kxqx0c0FD5CMCjd&5OIvVNBs`OFLEM-W`6N}zl&7;nXc|+vH_7tF zQc2Uc!cwqgQt;(Gm!`?HoTVb+-zI|1OvL8bjP9iBQnKVaOQrE8WAxT$4R`tJt4!YB z!59Q4dY)0r@?@E3;J=7o)`OHTHEDo$Y&jjwM*ayk< zUNDg}4&i1hOL_K&2_Q4vMqdFhHjTkS+`nFCoF=<;wg$eXz=SVTzGTvb(~K)7;_gI1 z)+{M=quE56j>9y|X9+Kg7t2N5nK6|>B9L+EtlZhrQ_M1D8mEp~mV-Cp+juJ@o+`z1 zL)6rSCGo;Gh9FiQKsvx)^dKE+S~FgVLRrS9g_s+X=cXdF*}TZ83Cm6}!MW{-!I9`$ zKOLIq$wQ(=-`=qYOWfXuxT{rMx5iuUn-*LCOY)tnEI{0{sxReo)ev2KTcaev^;3Kx zUGv^Ey=|PhyWRgn>i@P}DWXS=Mi1GHYt#9T>MB5PDUSCZx!so4g}tpx&1L7x(VS*^ z%9CwVn7~ozUafLz+GN7=t!V?6V^~n5iX)HQ`8a|}?ub{%9nvCzcfZeJrG@LHk9+uj z_isNSpZ(Cvt$%%w{No3pC*nKsI383*2v4_MmaqcyNZ2o_fIk2(8 zZ87BOYbK=Yl(?ea`iQE)ljy#U*?xOg5=V81iM c@lZX6V@u9XY)sOA<|7i}2v-AyAN~IP4;8KZMgRZ+ literal 0 HcmV?d00001 diff --git a/codes/news_recsys/news_rec_server/recprocess/rank/models/__pycache__/deepfm_ppnet.cpython-39.pyc b/codes/news_recsys/news_rec_server/recprocess/rank/models/__pycache__/deepfm_ppnet.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..46365be3fbb38177bf56496056ed83bacf3aebcf GIT binary patch literal 2042 zcmZWqPjA~c6enp}R%}^z5-0Be4ZFFAVAy3qQ3M4V_|~9Uau|dHf@0FHQd%-dx@!yi z@8&1mN6*D(^r;5yc!$ag4FG2`AH9Tyu7fd((PccecmrkpPzpX_9%2+ zWpfigHoqEF`8`HRA&r)J5h|?1*W``0+Zu`63ac8$l)SX#WQlV!L5oOvI?~>fEIO*L zJ37=6tx<2~ExO7F4}n)-)fStoVLe^fp7wQ%`YV62r2^WZ!3ygx?6IvjmPl`d)l#J|96=|$%x~UsF&>OV1^7XF1I)PIzuBi@W8N&YE)6LT@$i2P7G+Nd4H8+7_$B8waP2C&yX909x^i?uE2M^L5Jj|E=gB{@Q8_;5f> zGm}{^Rho}j+&r44Lb4%C)fq7S>% zH+yb^yF8tq&~*PRi2ofbdU5LjVulYAr=7jKycnfi?t@VV9$Pu7BH@_?WyP&gk&jZ9 zjCnd4_@*_69Fi=b&6PAA>&yj9Mg^Zwb7|T<%ULQC{!Jp-#Dr{m!syxy=8`2RSt^Yu z8KdVG0ocn+pJnp=1C;;aLm*-ss>4U4Q}hZw1;B)b%g)TxDHB%B07~SJoKzV}hmYvM znCL?Dtu;2Gfg?1_NSVE0CVZSx%JO8MXK&~{R*uHtYTps^yA*iJmYko>>5tbLT>!0 zDW5ZG{Bgz=6Y=Iq0Lm;Wv835ZnXa>GmQNC16i?=}xI1Agfr9|T| zm}EJuCOn&OW5`pbSZ;_K8^0vlw52L$$wZ_yW4RJmqrwtl>To(BFT8L%X`%@)M4>Ez z<3dag$#c_?*<@N|)cB<+P5prt=isL3*~PxF%`LlPm*#nLkpZ!5$9ABJJ5}y$(Zo#~ zT#?eYwk4)y#SN9r#E~8P#G|80SP_|4iHs9=yer*}`ba^g;CQp@|Ah8uU5=yh64c>E znBxt&eia7+nA~C?Z(mxr5`;^;3f;LR-}QxxyJdHEk3b%ASoT!YHXJC9-u5`lQ=U{J zGj)f3c@69%@9u&LFW!B7F(icsMF9?L