@@ -0,0 +1,450 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "b06bec9d",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import gc\n",
|
||||
"import os\n",
|
||||
"import math\n",
|
||||
"import pickle\n",
|
||||
"\n",
|
||||
"import numpy as np\n",
|
||||
"import pandas as pd\n",
|
||||
"from tqdm.auto import tqdm\n",
|
||||
"from operator import itemgetter\n",
|
||||
"\n",
|
||||
"from sklearn.utils import shuffle\n",
|
||||
"from collections import defaultdict\n",
|
||||
"from metric import PrintMetric\n",
|
||||
"\n",
|
||||
"import warnings\n",
|
||||
"warnings.filterwarnings(\"ignore\")\n",
|
||||
"\n",
# Root folder of the dataset. The historical default is kept, but it can be
# overridden with the NEWS_REC_DATA_DIR environment variable so the notebook
# also runs on machines that do not have this drive layout.
data_root = os.environ.get('NEWS_REC_DATA_DIR', 'D:/news-rec/dataset')

raw_data_path = data_root + '/raw_data'        # input files are read from here
new_data_path = data_root + '/recall_data'     # recall artefacts are written here

os.makedirs(new_data_path, exist_ok=True)
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "4479018f",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": " user_id 设备名称 操作系统 所在省 所在市 \\\n0 1000372820 TAS-AN00 Android 广东 广州 \n10 1001440812 iPad IOS NaN NaN \n16 1001771644 V1901A Android 陕西 宝鸡 \n17 1001773994 STK-AL00 Android 广东 河源 \n142 1017050854 DUB-AL00 Android 湖北 武汉 \n\n 年龄 \\\n0 A_0_24:0.404616,A_25_29:0.059027,A_30_39:0.516... \n10 A_0_24:0.312738,A_25_29:0.261741,A_30_39:0.268... \n16 A_0_24:0.445645,A_25_29:0.330315,A_30_39:0.153... \n17 A_0_24:0.497841,A_25_29:0.245965,A_30_39:0.219... \n142 A_0_24:0.008895,A_25_29:0.067247,A_30_39:0.824... \n\n 性别 \n0 female:0.051339,male:0.948661 \n10 female:0.907997,male:0.092003 \n16 female:0.049787,male:0.950213 \n17 female:0.117317,male:0.882683 \n142 female:0.519291,male:0.480709 ",
|
||||
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>user_id</th>\n <th>设备名称</th>\n <th>操作系统</th>\n <th>所在省</th>\n <th>所在市</th>\n <th>年龄</th>\n <th>性别</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>1000372820</td>\n <td>TAS-AN00</td>\n <td>Android</td>\n <td>广东</td>\n <td>广州</td>\n <td>A_0_24:0.404616,A_25_29:0.059027,A_30_39:0.516...</td>\n <td>female:0.051339,male:0.948661</td>\n </tr>\n <tr>\n <th>10</th>\n <td>1001440812</td>\n <td>iPad</td>\n <td>IOS</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>A_0_24:0.312738,A_25_29:0.261741,A_30_39:0.268...</td>\n <td>female:0.907997,male:0.092003</td>\n </tr>\n <tr>\n <th>16</th>\n <td>1001771644</td>\n <td>V1901A</td>\n <td>Android</td>\n <td>陕西</td>\n <td>宝鸡</td>\n <td>A_0_24:0.445645,A_25_29:0.330315,A_30_39:0.153...</td>\n <td>female:0.049787,male:0.950213</td>\n </tr>\n <tr>\n <th>17</th>\n <td>1001773994</td>\n <td>STK-AL00</td>\n <td>Android</td>\n <td>广东</td>\n <td>河源</td>\n <td>A_0_24:0.497841,A_25_29:0.245965,A_30_39:0.219...</td>\n <td>female:0.117317,male:0.882683</td>\n </tr>\n <tr>\n <th>142</th>\n <td>1017050854</td>\n <td>DUB-AL00</td>\n <td>Android</td>\n <td>湖北</td>\n <td>武汉</td>\n <td>A_0_24:0.008895,A_25_29:0.067247,A_30_39:0.824...</td>\n <td>female:0.519291,male:0.480709</td>\n </tr>\n </tbody>\n</table>\n</div>"
|
||||
},
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
# Read the tab-separated user table (its first column is used as the index)
# and give the columns their working names.
user_info_columns = ["user_id", "设备名称", "操作系统", "所在省", "所在市", "年龄", "性别"]

user_info = pd.read_csv(f'{raw_data_path}/user_info_5w.csv', sep='\t', index_col=0)
user_info.columns = user_info_columns

user_info.head()
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "22d466d5",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": " item_id 标题 发文时间 图片数量 一级分类 \\\n0 361653323 疫情谣言粉碎机丨接种新冠疫苗后用麻药或致死?盘点最新疫情谣言,别被忽悠了 1624522285000 1 健康 \n1 426732705 实拍本田飞度:空间真大,8万出头工薪族可选,但内饰能忍? 1610808303000 9 汽车 \n2 430221183 搭载135kw电机比亚迪秦plus纯电动版外观更精致 1612581556000 2 汽车 \n3 441756326 【提车作业】不顾他人眼光帕萨特phev俘获30老男人浪子心 1618825835000 23 汽车 \n4 443485341 魏延有反骨之心都能重用,赵云忠心为什么却不被重用? 1619484501000 4 历史 \n\n 二级分类 关键词 \n0 健康/疾病防护治疗及西医用药 医生:14.760494,吸烟:16.474872,板蓝根:15.597788,板蓝根^^熏... \n1 汽车/买车 155n:8.979802,polo:7.951116,中控台:5.954278,中网:7.... \n2 汽车/买车 etc:12.055207,代表:8.878175,内饰:5.342025,刀片:9.453... \n3 汽车/买车 丰田凯美瑞:12.772149,充电器:8.394001,品牌:8.436843,城市:7.... \n4 历史/中国史 三国:8.979797,五虎将:13.072728,人才:7.532783,保镖:6.811... ",
|
||||
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>item_id</th>\n <th>标题</th>\n <th>发文时间</th>\n <th>图片数量</th>\n <th>一级分类</th>\n <th>二级分类</th>\n <th>关键词</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>361653323</td>\n <td>疫情谣言粉碎机丨接种新冠疫苗后用麻药或致死?盘点最新疫情谣言,别被忽悠了</td>\n <td>1624522285000</td>\n <td>1</td>\n <td>健康</td>\n <td>健康/疾病防护治疗及西医用药</td>\n <td>医生:14.760494,吸烟:16.474872,板蓝根:15.597788,板蓝根^^熏...</td>\n </tr>\n <tr>\n <th>1</th>\n <td>426732705</td>\n <td>实拍本田飞度:空间真大,8万出头工薪族可选,但内饰能忍?</td>\n <td>1610808303000</td>\n <td>9</td>\n <td>汽车</td>\n <td>汽车/买车</td>\n <td>155n:8.979802,polo:7.951116,中控台:5.954278,中网:7....</td>\n </tr>\n <tr>\n <th>2</th>\n <td>430221183</td>\n <td>搭载135kw电机比亚迪秦plus纯电动版外观更精致</td>\n <td>1612581556000</td>\n <td>2</td>\n <td>汽车</td>\n <td>汽车/买车</td>\n <td>etc:12.055207,代表:8.878175,内饰:5.342025,刀片:9.453...</td>\n </tr>\n <tr>\n <th>3</th>\n <td>441756326</td>\n <td>【提车作业】不顾他人眼光帕萨特phev俘获30老男人浪子心</td>\n <td>1618825835000</td>\n <td>23</td>\n <td>汽车</td>\n <td>汽车/买车</td>\n <td>丰田凯美瑞:12.772149,充电器:8.394001,品牌:8.436843,城市:7....</td>\n </tr>\n <tr>\n <th>4</th>\n <td>443485341</td>\n <td>魏延有反骨之心都能重用,赵云忠心为什么却不被重用?</td>\n <td>1619484501000</td>\n <td>4</td>\n <td>历史</td>\n <td>历史/中国史</td>\n <td>三国:8.979797,五虎将:13.072728,人才:7.532783,保镖:6.811...</td>\n </tr>\n </tbody>\n</table>\n</div>"
|
||||
},
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
# Read the tab-separated article table and give the columns their working names.
doc_info_columns = ["item_id", "标题", "发文时间", "图片数量", "一级分类", "二级分类", "关键词"]

doc_info = pd.read_table(f'{raw_data_path}/doc_info.txt', sep='\t')
doc_info.columns = doc_info_columns

# item_id -> first-level category lookup, consumed later by ItemCF.
item2cate = {
    item_id: category
    for item_id, category in zip(doc_info['item_id'], doc_info['一级分类'])
}
doc_info.head()
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "7cf3ff94",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": " user_id item_id 展现时间 网路环境 刷新次数 展现位置 是否点击 消费时长(秒)\n0 1000014754 463510256 1624843756147 5 0 16 0 0\n1 1000014754 463852707 1624843756147 5 0 13 1 80\n2 1000014754 464757134 1625052999841 5 0 13 1 1050\n3 1000014754 464617167 1625052999841 5 0 16 1 286\n4 1000014754 465426190 1625382421168 5 0 5 0 0",
|
||||
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>user_id</th>\n <th>item_id</th>\n <th>展现时间</th>\n <th>网路环境</th>\n <th>刷新次数</th>\n <th>展现位置</th>\n <th>是否点击</th>\n <th>消费时长(秒)</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>1000014754</td>\n <td>463510256</td>\n <td>1624843756147</td>\n <td>5</td>\n <td>0</td>\n <td>16</td>\n <td>0</td>\n <td>0</td>\n </tr>\n <tr>\n <th>1</th>\n <td>1000014754</td>\n <td>463852707</td>\n <td>1624843756147</td>\n <td>5</td>\n <td>0</td>\n <td>13</td>\n <td>1</td>\n <td>80</td>\n </tr>\n <tr>\n <th>2</th>\n <td>1000014754</td>\n <td>464757134</td>\n <td>1625052999841</td>\n <td>5</td>\n <td>0</td>\n <td>13</td>\n <td>1</td>\n <td>1050</td>\n </tr>\n <tr>\n <th>3</th>\n <td>1000014754</td>\n <td>464617167</td>\n <td>1625052999841</td>\n <td>5</td>\n <td>0</td>\n <td>16</td>\n <td>1</td>\n <td>286</td>\n </tr>\n <tr>\n <th>4</th>\n <td>1000014754</td>\n <td>465426190</td>\n <td>1625382421168</td>\n <td>5</td>\n <td>0</td>\n <td>5</td>\n <td>0</td>\n <td>0</td>\n </tr>\n </tbody>\n</table>\n</div>"
|
||||
},
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
# Read the tab-separated impression log (its first column is used as the index)
# and give the columns their working names.
impression_columns = [
    "user_id", "item_id", "展现时间", "网路环境",
    "刷新次数", "展现位置", "是否点击", "消费时长(秒)",
]

all_data = pd.read_csv(f'{raw_data_path}/train_data_5w.csv', sep='\t', index_col=0)
all_data.columns = impression_columns

all_data.head()
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": " user_id item_id 展现时间 网路环境 刷新次数 展现位置 是否点击 消费时长(秒) \\\n0 1000014754 463510256 2021-06-28 01:29:16 5 0 16 0 0 \n1 1000014754 463852707 2021-06-28 01:29:16 5 0 13 1 80 \n2 1000014754 464757134 2021-06-30 11:36:39 5 0 13 1 1050 \n3 1000014754 464617167 2021-06-30 11:36:39 5 0 16 1 286 \n4 1000014754 465426190 2021-07-04 07:07:01 5 0 5 0 0 \n\n 展现时间_日期 \n0 28 \n1 28 \n2 30 \n3 30 \n4 4 ",
|
||||
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>user_id</th>\n <th>item_id</th>\n <th>展现时间</th>\n <th>网路环境</th>\n <th>刷新次数</th>\n <th>展现位置</th>\n <th>是否点击</th>\n <th>消费时长(秒)</th>\n <th>展现时间_日期</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>1000014754</td>\n <td>463510256</td>\n <td>2021-06-28 01:29:16</td>\n <td>5</td>\n <td>0</td>\n <td>16</td>\n <td>0</td>\n <td>0</td>\n <td>28</td>\n </tr>\n <tr>\n <th>1</th>\n <td>1000014754</td>\n <td>463852707</td>\n <td>2021-06-28 01:29:16</td>\n <td>5</td>\n <td>0</td>\n <td>13</td>\n <td>1</td>\n <td>80</td>\n <td>28</td>\n </tr>\n <tr>\n <th>2</th>\n <td>1000014754</td>\n <td>464757134</td>\n <td>2021-06-30 11:36:39</td>\n <td>5</td>\n <td>0</td>\n <td>13</td>\n <td>1</td>\n <td>1050</td>\n <td>30</td>\n </tr>\n <tr>\n <th>3</th>\n <td>1000014754</td>\n <td>464617167</td>\n <td>2021-06-30 11:36:39</td>\n <td>5</td>\n <td>0</td>\n <td>16</td>\n <td>1</td>\n <td>286</td>\n <td>30</td>\n </tr>\n <tr>\n <th>4</th>\n <td>1000014754</td>\n <td>465426190</td>\n <td>2021-07-04 07:07:01</td>\n <td>5</td>\n <td>0</td>\n <td>5</td>\n <td>0</td>\n <td>0</td>\n <td>4</td>\n </tr>\n </tbody>\n</table>\n</div>"
|
||||
},
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
# Convert the raw impression timestamp into a datetime column plus a
# day-of-month helper column.
#
# The raw value is reduced to its first 10 characters and parsed as epoch
# seconds. The dtype guard makes the cell safe to re-run: without it a second
# execution would try to parse an already formatted date string as an integer
# and raise.
if not pd.api.types.is_datetime64_any_dtype(all_data['展现时间']):
    epoch_seconds = all_data['展现时间'].astype('str').str[:10].astype('int64')
    all_data['展现时间'] = pd.to_datetime(epoch_seconds, unit='s', errors='coerce')

# Day of month only: the month is not part of this key.
all_data['展现时间_日期'] = all_data['展现时间'].dt.day

all_data.head()
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false,
|
||||
"pycharm": {
|
||||
"name": "#%%\n"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"id": "73c9843e",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": "179"
|
||||
},
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
mode = 'debug'

# NOTE(review): neither split filters on '是否点击', so impressions that were
# not clicked are kept as interactions as well -- confirm this is intended.
if mode == 'debug':
    # Seeded shuffle keeps debug runs reproducible. reset_index returns a new
    # frame, so its result has to be assigned for the index to be reset.
    all_data = shuffle(all_data, random_state=42).reset_index(drop=True)

    # Debug mode trains on day 5 only.
    train_data = all_data[(all_data['展现时间_日期'] >= 5) & (all_data['展现时间_日期'] < 6)]
else:
    # Full mode trains on days 1-5.
    train_data = all_data[(all_data['展现时间_日期'] >= 1) & (all_data['展现时间_日期'] < 6)]

# Day 6 is the hold-out set in both modes.
test_data = all_data.loc[all_data['展现时间_日期'] == 6, :]

# Sort the training set chronologically. ItemCF uses the position of an item in
# a user's list as its interaction order. Re-binding (instead of inplace=True)
# avoids mutating a slice of all_data.
train_data = train_data.sort_values('展现时间')

# Release the large source frames; only the two splits and item2cate are needed
# from here on.
del all_data, doc_info, user_info
gc.collect()
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"15655\n",
|
||||
"33664\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
# Report how many distinct users and items the training split contains.
for id_column in ('user_id', 'item_id'):
    print(train_data[id_column].nunique())
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false,
|
||||
"pycharm": {
|
||||
"name": "#%%\n"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [],
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"outputs": [],
|
||||
"source": [
class ItemCF(object):
    """Item-based collaborative-filtering recall model.

    Item-to-item similarities are accumulated from items that co-occur in the
    same user's history and are then normalised by item popularity. Calling
    the instance returns a recommendation list per user; users that were not
    seen during fitting receive the most popular items instead.
    """

    def __init__(self, his_data, item2cate):
        """Store the inputs and create empty model state.

        Args:
            his_data: DataFrame with at least 'user_id' and 'item_id' columns.
                The caller is expected to have sorted it by time, because the
                position of an item inside a user's list is used as its
                interaction order.
            item2cate: mapping of item_id -> category label.
        """
        self.user_set = set()
        self.his_data = his_data
        self.item2cate = item2cate

        # item_id -> {other item_id -> similarity}
        self.item_sim_matrix = dict()
        # item_id -> number of occurrences across all user histories
        self.item_interacted_num = defaultdict(int)

    def calculate_similarity_matrix(self):
        """Fill user_set, item_interacted_num and item_sim_matrix from his_data."""
        user2items = self.his_data.groupby('user_id')['item_id'].apply(list)
        # Use the mapping stored on the instance, so the model does not depend
        # on a notebook-level variable (e.g. after being unpickled elsewhere).
        cate_of = self.item2cate.get

        # Stage 1: accumulate weighted co-occurrences.
        for user_id, items in tqdm(user2items.items(), total=user2items.shape[0]):
            self.user_set.add(user_id)
            # Active users contribute less to each item pair than inactive ones.
            activity_penalty = math.log(1 + len(items))
            for idx1, item_1 in enumerate(items):
                self.item_interacted_num[item_1] += 1
                neighbours = self.item_sim_matrix.setdefault(item_1, {})
                cate_1 = cate_of(item_1, None)
                for idx2, item_2 in enumerate(items):
                    if item_1 == item_2:
                        continue
                    # Reading is sequential: a later item is considered more
                    # related to the items that were read before it.
                    related_score = 1 if idx1 > idx2 else 0.8
                    # Items from different categories are considered less related.
                    if cate_1 != cate_of(item_2, None):
                        related_score *= 0.5
                    neighbours[item_2] = neighbours.get(item_2, 0) + related_score / activity_penalty

        # Stage 2: more shared users means higher similarity, but popular items
        # co-occur with almost everything, so divide by the geometric mean of
        # both items' interaction counts to damp them.
        counts = self.item_interacted_num
        for item_1, related_items in tqdm(self.item_sim_matrix.items()):
            for item_2, weight in related_items.items():
                related_items[item_2] = weight / math.sqrt(counts[item_1] * counts[item_2])

    def __call__(self, users, _n=50, _topk=20):
        """Recommend up to `_topk` items for every user in `users`.

        Args:
            users: iterable of user ids.
            _n: number of most similar neighbours taken per history item.
            _topk: length of each recommendation list.

        Returns:
            dict of user_id -> list of item ids, best first. Lists are padded
            with popular items that are not already recommended.
        """
        print(f'开始ItemCF召回: Recall@{_topk}-Near@{_n}')
        user2items = self.his_data.groupby('user_id')['item_id'].apply(list)

        popularity_ranked = [item for item, _ in sorted(
            self.item_interacted_num.items(), key=itemgetter(1), reverse=True)]
        popular_items = popularity_ranked[:_topk]

        # The top-_n neighbours of an item are the same for every user, so sort
        # each neighbour list once and reuse it.
        top_neighbours = {}

        user_rec = {}
        for user_id in tqdm(users):
            # Unknown user: recommend popular items. Each user gets a separate
            # list so that mutating one result cannot affect another.
            if user_id not in self.user_set:
                user_rec[user_id] = list(popular_items)
                continue

            rank = defaultdict(int)
            for his_item in user2items.loc[user_id]:
                if his_item not in top_neighbours:
                    top_neighbours[his_item] = sorted(
                        self.item_sim_matrix[his_item].items(),
                        key=itemgetter(1), reverse=True)[:_n]
                # Items the user has already interacted with are deliberately
                # not filtered out of the candidates.
                for candidate_item, item_smi_score in top_neighbours[his_item]:
                    rank[candidate_item] += item_smi_score

            rec_items = [item for item, _ in sorted(
                rank.items(), key=itemgetter(1), reverse=True)[:_topk]]

            # Too few candidates: pad with popular items, skipping those that
            # are already in the list so it never contains duplicates.
            if len(rec_items) < _topk:
                already_recommended = set(rec_items)
                for item in popularity_ranked:
                    if item in already_recommended:
                        continue
                    rec_items.append(item)
                    if len(rec_items) == _topk:
                        break
            user_rec[user_id] = rec_items

        return user_rec
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false,
|
||||
"pycharm": {
|
||||
"name": "#%%\n"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"outputs": [],
|
||||
"source": [
# Folder that holds the cached ItemCF models; created on first use.
icf_cls_path = os.path.join(new_data_path, 'item_cf')
os.makedirs(name=icf_cls_path, exist_ok=True)
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false,
|
||||
"pycharm": {
|
||||
"name": "#%%\n"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": " 0%| | 0/15655 [00:00<?, ?it/s]",
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"version_major": 2,
|
||||
"version_minor": 0,
|
||||
"model_id": "d5c3c1e99e764b40a4dac12d60bc4b77"
|
||||
}
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/plain": " 0%| | 0/33664 [00:00<?, ?it/s]",
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"version_major": 2,
|
||||
"version_minor": 0,
|
||||
"model_id": "9011a3e044c948f9850a19db02e464bf"
|
||||
}
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
}
|
||||
],
|
||||
"source": [
# Load the cached ItemCF model for this mode, or fit it and write the cache.
# The file name is kept as-is so caches written by earlier runs are still found.
demo_icf_path = os.path.join(icf_cls_path, mode+'_ifc.pkl')
if os.path.exists(demo_icf_path):
    # NOTE: unpickling executes code stored in the file -- only load caches
    # that were produced by this notebook.
    with open(demo_icf_path, 'rb') as file:
        demo_icf = pickle.load(file)
else:
    demo_icf = ItemCF(train_data, item2cate)
    demo_icf.calculate_similarity_matrix()

    # Serialise before opening the file, so a pickling failure cannot leave a
    # truncated cache behind; the context manager closes the handle on error.
    demo_icf_pkl = pickle.dumps(demo_icf)
    with open(demo_icf_path, 'wb') as output_icf:
        output_icf.write(demo_icf_pkl)
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false,
|
||||
"pycharm": {
|
||||
"name": "#%%\n"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"开始ItemCF召回: Recall@100-Near@50\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/plain": " 0%| | 0/13792 [00:00<?, ?it/s]",
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"version_major": 2,
|
||||
"version_minor": 0,
|
||||
"model_id": "afe3cbc4db0f423c9a62c7bede9befa2"
|
||||
}
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
}
|
||||
],
|
||||
"source": [
# Neighbours taken per history item, and length of each recommendation list.
n = 50
topk = 100

# Recall for every user that appears in the hold-out split.
test_users = test_data['user_id'].unique()
icf_rec_result = demo_icf(test_users, _n=n, _topk=topk)

# Align predictions with the ground truth: one row per hold-out user, carrying
# the list of that user's hold-out items.
test_user_group = test_data.groupby('user_id')['item_id'].agg(list).reset_index()
test_pred = list(map(icf_rec_result.__getitem__, test_user_group['user_id']))
test_true = test_user_group['item_id'].to_list()
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false,
|
||||
"pycharm": {
|
||||
"name": "#%%\n"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 14,
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"MAP@100: 0.016906571748779006\n",
|
||||
"Recall@100: 0.15798311228206416\n",
|
||||
"Precision@100: 0.027745069605568447\n",
|
||||
"F1@100: 0.03914852311427278\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"PrintMetric(test_true, test_pred, topk)"
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false,
|
||||
"pycharm": {
|
||||
"name": "#%%\n"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"outputs": [],
|
||||
"source": [],
|
||||
"metadata": {
|
||||
"collapsed": false,
|
||||
"pycharm": {
|
||||
"name": "#%%\n"
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.8.12"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
2063
codes/news_recsys/news_rec_server/recprocess/recall/i2i_pop.ipynb
Normal file
2063
codes/news_recsys/news_rec_server/recprocess/recall/i2i_pop.ipynb
Normal file
File diff suppressed because one or more lines are too long
@@ -0,0 +1,413 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "b06bec9d",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import gc\n",
|
||||
"import os\n",
|
||||
"import math\n",
|
||||
"import pickle\n",
|
||||
"\n",
|
||||
"import numpy as np\n",
|
||||
"import pandas as pd\n",
|
||||
"from tqdm.auto import tqdm\n",
|
||||
"from operator import itemgetter\n",
|
||||
"\n",
|
||||
"from sklearn.utils import shuffle\n",
|
||||
"from collections import defaultdict\n",
|
||||
"\n",
|
||||
"from metric import PrintMetric\n",
|
||||
"\n",
|
||||
"import warnings\n",
|
||||
"warnings.filterwarnings(\"ignore\")\n",
|
||||
"\n",
|
||||
# Root folder of the dataset. It can be overridden through the
# NEWS_REC_DATA_ROOT environment variable so the notebook is not tied to one
# machine; the default reproduces the original hard-coded location exactly.
data_root = os.environ.get('NEWS_REC_DATA_ROOT', 'D:/news-rec/dataset')

# Both paths stay plain strings because later cells build file names with `+`.
raw_data_path = f'{data_root}/raw_data'
new_data_path = f'{data_root}/recall_data'

# Make sure the output folder exists before any cell writes into it.
os.makedirs(new_data_path, exist_ok=True)
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "4479018f",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": " user_id 设备名称 操作系统 所在省 所在市 \\\n0 1000372820 TAS-AN00 Android 广东 广州 \n10 1001440812 iPad IOS NaN NaN \n16 1001771644 V1901A Android 陕西 宝鸡 \n17 1001773994 STK-AL00 Android 广东 河源 \n142 1017050854 DUB-AL00 Android 湖北 武汉 \n\n 年龄 \\\n0 A_0_24:0.404616,A_25_29:0.059027,A_30_39:0.516... \n10 A_0_24:0.312738,A_25_29:0.261741,A_30_39:0.268... \n16 A_0_24:0.445645,A_25_29:0.330315,A_30_39:0.153... \n17 A_0_24:0.497841,A_25_29:0.245965,A_30_39:0.219... \n142 A_0_24:0.008895,A_25_29:0.067247,A_30_39:0.824... \n\n 性别 \n0 female:0.051339,male:0.948661 \n10 female:0.907997,male:0.092003 \n16 female:0.049787,male:0.950213 \n17 female:0.117317,male:0.882683 \n142 female:0.519291,male:0.480709 ",
|
||||
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>user_id</th>\n <th>设备名称</th>\n <th>操作系统</th>\n <th>所在省</th>\n <th>所在市</th>\n <th>年龄</th>\n <th>性别</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>1000372820</td>\n <td>TAS-AN00</td>\n <td>Android</td>\n <td>广东</td>\n <td>广州</td>\n <td>A_0_24:0.404616,A_25_29:0.059027,A_30_39:0.516...</td>\n <td>female:0.051339,male:0.948661</td>\n </tr>\n <tr>\n <th>10</th>\n <td>1001440812</td>\n <td>iPad</td>\n <td>IOS</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>A_0_24:0.312738,A_25_29:0.261741,A_30_39:0.268...</td>\n <td>female:0.907997,male:0.092003</td>\n </tr>\n <tr>\n <th>16</th>\n <td>1001771644</td>\n <td>V1901A</td>\n <td>Android</td>\n <td>陕西</td>\n <td>宝鸡</td>\n <td>A_0_24:0.445645,A_25_29:0.330315,A_30_39:0.153...</td>\n <td>female:0.049787,male:0.950213</td>\n </tr>\n <tr>\n <th>17</th>\n <td>1001773994</td>\n <td>STK-AL00</td>\n <td>Android</td>\n <td>广东</td>\n <td>河源</td>\n <td>A_0_24:0.497841,A_25_29:0.245965,A_30_39:0.219...</td>\n <td>female:0.117317,male:0.882683</td>\n </tr>\n <tr>\n <th>142</th>\n <td>1017050854</td>\n <td>DUB-AL00</td>\n <td>Android</td>\n <td>湖北</td>\n <td>武汉</td>\n <td>A_0_24:0.008895,A_25_29:0.067247,A_30_39:0.824...</td>\n <td>female:0.519291,male:0.480709</td>\n </tr>\n </tbody>\n</table>\n</div>"
|
||||
},
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
# Load the tab-separated user table; the file's first column is used as the index.
user_info = pd.read_csv(
    f'{raw_data_path}/user_info_5w.csv',
    sep='\t',
    index_col=0,
)
# Overwrite the header with the column names used throughout the notebook.
user_info.columns = [
    "user_id",
    "设备名称",
    "操作系统",
    "所在省",
    "所在市",
    "年龄",
    "性别",
]

user_info.head()
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "22d466d5",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": " item_id 标题 发文时间 图片数量 一级分类 \\\n0 361653323 疫情谣言粉碎机丨接种新冠疫苗后用麻药或致死?盘点最新疫情谣言,别被忽悠了 1624522285000 1 健康 \n1 426732705 实拍本田飞度:空间真大,8万出头工薪族可选,但内饰能忍? 1610808303000 9 汽车 \n2 430221183 搭载135kw电机比亚迪秦plus纯电动版外观更精致 1612581556000 2 汽车 \n3 441756326 【提车作业】不顾他人眼光帕萨特phev俘获30老男人浪子心 1618825835000 23 汽车 \n4 443485341 魏延有反骨之心都能重用,赵云忠心为什么却不被重用? 1619484501000 4 历史 \n\n 二级分类 关键词 \n0 健康/疾病防护治疗及西医用药 医生:14.760494,吸烟:16.474872,板蓝根:15.597788,板蓝根^^熏... \n1 汽车/买车 155n:8.979802,polo:7.951116,中控台:5.954278,中网:7.... \n2 汽车/买车 etc:12.055207,代表:8.878175,内饰:5.342025,刀片:9.453... \n3 汽车/买车 丰田凯美瑞:12.772149,充电器:8.394001,品牌:8.436843,城市:7.... \n4 历史/中国史 三国:8.979797,五虎将:13.072728,人才:7.532783,保镖:6.811... ",
|
||||
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>item_id</th>\n <th>标题</th>\n <th>发文时间</th>\n <th>图片数量</th>\n <th>一级分类</th>\n <th>二级分类</th>\n <th>关键词</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>361653323</td>\n <td>疫情谣言粉碎机丨接种新冠疫苗后用麻药或致死?盘点最新疫情谣言,别被忽悠了</td>\n <td>1624522285000</td>\n <td>1</td>\n <td>健康</td>\n <td>健康/疾病防护治疗及西医用药</td>\n <td>医生:14.760494,吸烟:16.474872,板蓝根:15.597788,板蓝根^^熏...</td>\n </tr>\n <tr>\n <th>1</th>\n <td>426732705</td>\n <td>实拍本田飞度:空间真大,8万出头工薪族可选,但内饰能忍?</td>\n <td>1610808303000</td>\n <td>9</td>\n <td>汽车</td>\n <td>汽车/买车</td>\n <td>155n:8.979802,polo:7.951116,中控台:5.954278,中网:7....</td>\n </tr>\n <tr>\n <th>2</th>\n <td>430221183</td>\n <td>搭载135kw电机比亚迪秦plus纯电动版外观更精致</td>\n <td>1612581556000</td>\n <td>2</td>\n <td>汽车</td>\n <td>汽车/买车</td>\n <td>etc:12.055207,代表:8.878175,内饰:5.342025,刀片:9.453...</td>\n </tr>\n <tr>\n <th>3</th>\n <td>441756326</td>\n <td>【提车作业】不顾他人眼光帕萨特phev俘获30老男人浪子心</td>\n <td>1618825835000</td>\n <td>23</td>\n <td>汽车</td>\n <td>汽车/买车</td>\n <td>丰田凯美瑞:12.772149,充电器:8.394001,品牌:8.436843,城市:7....</td>\n </tr>\n <tr>\n <th>4</th>\n <td>443485341</td>\n <td>魏延有反骨之心都能重用,赵云忠心为什么却不被重用?</td>\n <td>1619484501000</td>\n <td>4</td>\n <td>历史</td>\n <td>历史/中国史</td>\n <td>三国:8.979797,五虎将:13.072728,人才:7.532783,保镖:6.811...</td>\n </tr>\n </tbody>\n</table>\n</div>"
|
||||
},
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
# Load the tab-separated article table (read_csv with sep='\t' is the same
# parser that read_table dispatches to).
doc_info = pd.read_csv(f'{raw_data_path}/doc_info.txt', sep='\t')
# Overwrite the header with the column names used throughout the notebook.
doc_info.columns = [
    "item_id",
    "标题",
    "发文时间",
    "图片数量",
    "一级分类",
    "二级分类",
    "关键词",
]

doc_info.head()
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "7cf3ff94",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": " user_id item_id 展现时间 网路环境 刷新次数 展现位置 是否点击 消费时长(秒)\n0 1000014754 463510256 1624843756147 5 0 16 0 0\n1 1000014754 463852707 1624843756147 5 0 13 1 80\n2 1000014754 464757134 1625052999841 5 0 13 1 1050\n3 1000014754 464617167 1625052999841 5 0 16 1 286\n4 1000014754 465426190 1625382421168 5 0 5 0 0",
|
||||
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>user_id</th>\n <th>item_id</th>\n <th>展现时间</th>\n <th>网路环境</th>\n <th>刷新次数</th>\n <th>展现位置</th>\n <th>是否点击</th>\n <th>消费时长(秒)</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>1000014754</td>\n <td>463510256</td>\n <td>1624843756147</td>\n <td>5</td>\n <td>0</td>\n <td>16</td>\n <td>0</td>\n <td>0</td>\n </tr>\n <tr>\n <th>1</th>\n <td>1000014754</td>\n <td>463852707</td>\n <td>1624843756147</td>\n <td>5</td>\n <td>0</td>\n <td>13</td>\n <td>1</td>\n <td>80</td>\n </tr>\n <tr>\n <th>2</th>\n <td>1000014754</td>\n <td>464757134</td>\n <td>1625052999841</td>\n <td>5</td>\n <td>0</td>\n <td>13</td>\n <td>1</td>\n <td>1050</td>\n </tr>\n <tr>\n <th>3</th>\n <td>1000014754</td>\n <td>464617167</td>\n <td>1625052999841</td>\n <td>5</td>\n <td>0</td>\n <td>16</td>\n <td>1</td>\n <td>286</td>\n </tr>\n <tr>\n <th>4</th>\n <td>1000014754</td>\n <td>465426190</td>\n <td>1625382421168</td>\n <td>5</td>\n <td>0</td>\n <td>5</td>\n <td>0</td>\n <td>0</td>\n </tr>\n </tbody>\n</table>\n</div>"
|
||||
},
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
# Load the tab-separated interaction log; the file's first column is used as the index.
all_data = pd.read_csv(
    f'{raw_data_path}/train_data_5w.csv',
    sep='\t',
    index_col=0,
)
# Overwrite the header with the column names used throughout the notebook.
all_data.columns = [
    "user_id",
    "item_id",
    "展现时间",
    "网路环境",
    "刷新次数",
    "展现位置",
    "是否点击",
    "消费时长(秒)",
]

all_data.head()
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": " user_id item_id 展现时间 网路环境 刷新次数 展现位置 是否点击 消费时长(秒) \\\n0 1000014754 463510256 2021-06-28 01:29:16 5 0 16 0 0 \n1 1000014754 463852707 2021-06-28 01:29:16 5 0 13 1 80 \n2 1000014754 464757134 2021-06-30 11:36:39 5 0 13 1 1050 \n3 1000014754 464617167 2021-06-30 11:36:39 5 0 16 1 286 \n4 1000014754 465426190 2021-07-04 07:07:01 5 0 5 0 0 \n\n 展现时间_日期 \n0 28 \n1 28 \n2 30 \n3 30 \n4 4 ",
|
||||
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>user_id</th>\n <th>item_id</th>\n <th>展现时间</th>\n <th>网路环境</th>\n <th>刷新次数</th>\n <th>展现位置</th>\n <th>是否点击</th>\n <th>消费时长(秒)</th>\n <th>展现时间_日期</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>1000014754</td>\n <td>463510256</td>\n <td>2021-06-28 01:29:16</td>\n <td>5</td>\n <td>0</td>\n <td>16</td>\n <td>0</td>\n <td>0</td>\n <td>28</td>\n </tr>\n <tr>\n <th>1</th>\n <td>1000014754</td>\n <td>463852707</td>\n <td>2021-06-28 01:29:16</td>\n <td>5</td>\n <td>0</td>\n <td>13</td>\n <td>1</td>\n <td>80</td>\n <td>28</td>\n </tr>\n <tr>\n <th>2</th>\n <td>1000014754</td>\n <td>464757134</td>\n <td>2021-06-30 11:36:39</td>\n <td>5</td>\n <td>0</td>\n <td>13</td>\n <td>1</td>\n <td>1050</td>\n <td>30</td>\n </tr>\n <tr>\n <th>3</th>\n <td>1000014754</td>\n <td>464617167</td>\n <td>2021-06-30 11:36:39</td>\n <td>5</td>\n <td>0</td>\n <td>16</td>\n <td>1</td>\n <td>286</td>\n <td>30</td>\n </tr>\n <tr>\n <th>4</th>\n <td>1000014754</td>\n <td>465426190</td>\n <td>2021-07-04 07:07:01</td>\n <td>5</td>\n <td>0</td>\n <td>5</td>\n <td>0</td>\n <td>0</td>\n <td>4</td>\n </tr>\n </tbody>\n</table>\n</div>"
|
||||
},
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
# Convert the raw display timestamp into a datetime column.
#
# The conversion keeps the first 10 characters of the value's string form and
# reads them as epoch seconds (the stored sample values have 13 digits, i.e.
# this drops the trailing three). The dtype guard makes the cell safe to
# re-run: without it a second execution would try int('2021-06-28') on the
# already-converted column and raise ValueError.
if not pd.api.types.is_datetime64_any_dtype(all_data['展现时间']):
    epoch_seconds = all_data['展现时间'].astype('str').str[:10].astype('int64')
    all_data['展现时间'] = pd.to_datetime(epoch_seconds, unit='s', errors='coerce')

# Day of month of the display time; used by a later cell to split train/test.
all_data['展现时间_日期'] = all_data['展现时间'].dt.day

all_data.head()
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false,
|
||||
"pycharm": {
|
||||
"name": "#%%\n"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"id": "73c9843e",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": "75"
|
||||
},
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
mode = 'debug'

# Day 6 is always the held-out test day. 'debug' trains on day 5 only, any
# other mode trains on days 1-5.
if mode == 'debug':
    # Seeded so the row order is reproducible across kernel restarts. The
    # original called reset_index() without keeping its return value, so the
    # index was never actually reset; the result is assigned here.
    all_data = shuffle(all_data, random_state=2021).reset_index(drop=True)
    train_start_day = 5
else:
    train_start_day = 1

display_day = all_data['展现时间_日期']
train_data = all_data[(display_day >= train_start_day) & (display_day < 6)]
test_data = all_data.loc[display_day == 6, :]

# Release the large source frames; only train_data / test_data are used below.
del all_data, doc_info, user_info, display_day
gc.collect()
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"15655\n",
|
||||
"33664\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
# Number of distinct users, then distinct items, in the training split.
for id_column in ('user_id', 'item_id'):
    print(train_data[id_column].nunique())
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false,
|
||||
"pycharm": {
|
||||
"name": "#%%\n"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"id": "345cc0d2",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
class UserCF(object):
    """User-based collaborative-filtering recall with a popularity fallback.

    Two users are similar when they interacted with the same items. Every
    shared item contributes ``1 / log(1 + users_on_item)`` to the pair, and the
    accumulated weight is divided by ``sqrt(N(u) * N(v))`` where ``N`` is the
    interaction count recorded for each user.
    """

    def __init__(self, his_data):
        """Keep the interaction history and create empty statistics.

        Args:
            his_data: DataFrame with at least ``user_id`` and ``item_id`` columns.
        """
        self.user_set = set()
        self.item_set = set()

        self.his_data = his_data
        self.user_sim_matrix = dict()
        self.user_interacted_num = defaultdict(int)
        self.item_interacted_num = defaultdict(int)  # used for the popularity fallback

    def calculate_similarity_matrix(self):
        """Build ``self.user_sim_matrix`` (``{user: {other_user: similarity}}``).

        Also fills ``user_set``, ``item_set``, ``user_interacted_num`` and
        ``item_interacted_num``. All of them are rebuilt from scratch, so
        calling the method again does not double-count.
        """
        # NOTE(review): repeated (user, item) rows in his_data are counted once
        # per row - confirm whether the history is de-duplicated upstream.
        self.user_set = set()
        self.item_set = set()
        self.user_sim_matrix = dict()
        self.user_interacted_num = defaultdict(int)
        self.item_interacted_num = defaultdict(int)

        item2users = self.his_data.groupby('item_id')['user_id'].apply(list)

        # Stage 1: accumulate co-occurrence weights item by item.
        for item_id, item_users in tqdm(item2users.items(), total=len(item2users)):
            self.item_set.add(item_id)
            self.user_set.update(item_users)
            self.item_interacted_num[item_id] += len(item_users)

            # A popular item says less about taste than a niche one, so its
            # contribution shrinks with the number of users who touched it.
            item_weight = 1 / math.log(1 + len(item_users))
            for user_1 in item_users:
                self.user_interacted_num[user_1] += 1
                related_users = self.user_sim_matrix.setdefault(user_1, {})
                for user_2 in item_users:
                    if user_1 == user_2:
                        continue
                    related_users[user_2] = related_users.get(user_2, 0) + item_weight

        # Stage 2: very active users co-occur with almost everybody, so the raw
        # weight is normalised by both users' activity.
        print('计算UserCF第二阶段...')
        for user_1, related_users in tqdm(self.user_sim_matrix.items()):
            for user_2, weight in related_users.items():
                related_users[user_2] = weight / math.sqrt(
                    self.user_interacted_num[user_1] * self.user_interacted_num[user_2])

    def __call__(self, users, _n=50, _topk=20):
        """Recommend up to ``_topk`` items for every user in ``users``.

        Args:
            users: iterable of user ids.
            _n: number of most similar users whose items are pooled.
            _topk: length of each recommendation list.

        Returns:
            dict mapping user id -> list of item ids. Users absent from the
            history get the most popular items. Lists that come up short are
            padded with popular items that are not already in the list.
            Items the user already interacted with are not filtered out.
        """
        # The original read the notebook-level global `topk` here and in the
        # padding step instead of the `_topk` argument.
        print(f'开始UserCF召回: Recall@{_topk}-Near@{_n}')

        # A plain dict makes the per-neighbour lookup cheap.
        user2items = self.his_data.groupby('user_id')['item_id'].apply(list).to_dict()
        popular_ranked = [item for item, _ in sorted(
            self.item_interacted_num.items(), key=itemgetter(1), reverse=True)]
        popular_items = popular_ranked[:_topk]

        user_rec = {}
        for user_id in tqdm(users):
            # Unknown user: fall back to popularity. A copy is stored so the
            # lists of different users never alias each other.
            if user_id not in self.user_set:
                user_rec[user_id] = list(popular_items)
                continue

            rank = defaultdict(int)
            neighbours = sorted(self.user_sim_matrix[user_id].items(),
                                key=itemgetter(1), reverse=True)[:_n]
            for relate_user, user_sim_score in neighbours:
                for candidate_item in user2items[relate_user]:
                    rank[candidate_item] += user_sim_score

            rec_items = [item for item, _ in sorted(
                rank.items(), key=itemgetter(1), reverse=True)[:_topk]]

            # Pad short lists with popular items, skipping ones already present.
            if len(rec_items) < _topk:
                already_recommended = set(rec_items)
                for item in popular_ranked:
                    if item in already_recommended:
                        continue
                    rec_items.append(item)
                    if len(rec_items) == _topk:
                        break
            user_rec[user_id] = rec_items

        return user_rec
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"outputs": [],
|
||||
"source": [
|
||||
# Sub-folder of the recall output directory that holds the pickled UserCF models.
ucf_cls_path = os.path.join(new_data_path, 'user_cf')
os.makedirs(name=ucf_cls_path, exist_ok=True)
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false,
|
||||
"pycharm": {
|
||||
"name": "#%%\n"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"outputs": [],
|
||||
"source": [
|
||||
# One cached model per mode.
# NOTE(review): '_ufc' looks like a transposition of 'ucf'; the name is kept
# unchanged so caches written by earlier runs are still found.
# NOTE(review): the cache key is only `mode`; delete the file after changing
# the training split or the UserCF code, otherwise a stale model is loaded.
demo_ucf_path = os.path.join(ucf_cls_path, mode+'_ufc.pkl')

if os.path.exists(demo_ucf_path):
    # pickle can execute arbitrary code while loading: only open cache files
    # produced by this notebook.
    with open(demo_ucf_path, 'rb') as file:
        demo_ucf = pickle.load(file)
else:
    demo_ucf = UserCF(train_data)
    demo_ucf.calculate_similarity_matrix()

    # Write to a temporary file and rename it afterwards, so an interrupted
    # dump never leaves a truncated cache that the branch above would load.
    tmp_ucf_path = demo_ucf_path + '.tmp'
    with open(tmp_ucf_path, 'wb') as output_ucf:
        pickle.dump(demo_ucf, output_ucf)
    os.replace(tmp_ucf_path, demo_ucf_path)
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false,
|
||||
"pycharm": {
|
||||
"name": "#%%\n"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"开始ItemCF召回: Recall@100-Near@50\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/plain": " 0%| | 0/13792 [00:00<?, ?it/s]",
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"version_major": 2,
|
||||
"version_minor": 0,
|
||||
"model_id": "e7e74728073f45ba9d21802109555731"
|
||||
}
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
# Neighbourhood size and length of each recommendation list.
n = 50
topk = 100

# Recall: one recommendation list per user that appears in the test split.
test_users = test_data['user_id'].unique()
icf_rec_result = demo_ucf(test_users, _n=n, _topk=topk)

# Align predictions with the ground truth, one entry per test user.
# NOTE(review): the ground truth is every item_id row of test_data for the
# user; no filter on the click column is applied here - confirm that is intended.
test_user_group = test_data.groupby('user_id')['item_id'].agg(list).reset_index()
test_true = test_user_group['item_id'].to_list()
test_pred = list(map(icf_rec_result.__getitem__, test_user_group['user_id']))
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false,
|
||||
"pycharm": {
|
||||
"name": "#%%\n"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"MAP@100: 0.012998163356723474\n",
|
||||
"Recall@100: 0.14660816973054847\n",
|
||||
"Precision@100: 0.02628625290023202\n",
|
||||
"F1@100: 0.036618405618781665\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"PrintMetric(test_true, test_pred, topk)"
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false,
|
||||
"pycharm": {
|
||||
"name": "#%%\n"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [],
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"outputs": [],
|
||||
"source": [],
|
||||
"metadata": {
|
||||
"collapsed": false,
|
||||
"pycharm": {
|
||||
"name": "#%%\n"
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.8.12"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
Reference in New Issue
Block a user