diff --git a/.gitignore b/.gitignore index 4d591874..13bcf024 100644 --- a/.gitignore +++ b/.gitignore @@ -1,12 +1,12 @@ -/codes/news_recsys/news_rec_web/Vue-newsinfo/node_modules -/codes/news_recsys/news_rec_server/conf/__pycache__ -/codes/news_recsys/news_rec_server/controller/__pycache__ -/codes/news_recsys/news_rec_server/dao/__pycache__ -/codes/news_recsys/news_rec_server/dao/entity/__pycache__ -/codes/news_recsys/news_rec_server/logs -/codes/news_recsys/news_rec_server/materials/material_process/__pycache__ -/codes/news_recsys/news_rec_server/materials/news_scrapy/sinanews/__pycache__ -/codes/news_recsys/news_rec_server/materials/news_scrapy/sinanews/spiders/__pycache__ -/codes/news_recsys/news_rec_server/materials/user_process/__pycache__ -/codes/news_recsys/news_rec_server/recprocess/__pycache__ -/codes/news_recsys/news_rec_server/recprocess/recall/__pycache__ \ No newline at end of file +/codes/news_recsys/news_rec_web/Vue-newsinfo/node_modules/ +/codes/news_recsys/news_rec_server/conf/__pycache__/ +/codes/news_recsys/news_rec_server/controller/__pycache__/ +/codes/news_recsys/news_rec_server/dao/__pycache__/ +/codes/news_recsys/news_rec_server/dao/entity/__pycache__/ +/codes/news_recsys/news_rec_server/logs/ +/codes/news_recsys/news_rec_server/materials/material_process/__pycache__/ +/codes/news_recsys/news_rec_server/materials/news_scrapy/sinanews/__pycache__/ +/codes/news_recsys/news_rec_server/materials/news_scrapy/sinanews/spiders/__pycache__/ +/codes/news_recsys/news_rec_server/materials/user_process/__pycache__/ +/codes/news_recsys/news_rec_server/recprocess/__pycache__/ +/codes/news_recsys/news_rec_server/recprocess/recall/__pycache__/ \ No newline at end of file diff --git a/codes/news_recsys/news_rec_server/conf/proj_path.py b/codes/news_recsys/news_rec_server/conf/proj_path.py new file mode 100644 index 00000000..292fd1f2 --- /dev/null +++ b/codes/news_recsys/news_rec_server/conf/proj_path.py @@ -0,0 +1,6 @@ +import os + +home_path = os.environ['HOME'] +proj_path = home_path + "/news_rec_server/" + +stop_words_path = proj_path + "conf/stop_words.txt" \ No newline at end of file diff --git a/codes/news_recsys/news_rec_server/controller/__pycache__/log_controller.cpython-38.pyc b/codes/news_recsys/news_rec_server/controller/__pycache__/log_controller.cpython-38.pyc deleted file mode 100644 index 6025bfcc..00000000 Binary files a/codes/news_recsys/news_rec_server/controller/__pycache__/log_controller.cpython-38.pyc and /dev/null differ diff --git a/codes/news_recsys/news_rec_server/controller/__pycache__/user_action_controller.cpython-38.pyc b/codes/news_recsys/news_rec_server/controller/__pycache__/user_action_controller.cpython-38.pyc deleted file mode 100644 index 31457009..00000000 Binary files a/codes/news_recsys/news_rec_server/controller/__pycache__/user_action_controller.cpython-38.pyc and /dev/null differ diff --git a/codes/news_recsys/news_rec_server/dao/__pycache__/Mongo.cpython-38.pyc b/codes/news_recsys/news_rec_server/dao/__pycache__/Mongo.cpython-38.pyc deleted file mode 100644 index 55f6fc42..00000000 Binary files a/codes/news_recsys/news_rec_server/dao/__pycache__/Mongo.cpython-38.pyc and /dev/null differ diff --git a/codes/news_recsys/news_rec_server/dao/__pycache__/Mysql.cpython-38.pyc b/codes/news_recsys/news_rec_server/dao/__pycache__/Mysql.cpython-38.pyc deleted file mode 100644 index 5d9cc18d..00000000 Binary files a/codes/news_recsys/news_rec_server/dao/__pycache__/Mysql.cpython-38.pyc and /dev/null differ diff --git a/codes/news_recsys/news_rec_server/dao/__pycache__/Redis.cpython-38.pyc b/codes/news_recsys/news_rec_server/dao/__pycache__/Redis.cpython-38.pyc deleted file mode 100644 index db3b03be..00000000 Binary files a/codes/news_recsys/news_rec_server/dao/__pycache__/Redis.cpython-38.pyc and /dev/null differ diff --git a/codes/news_recsys/news_rec_server/dao/__pycache__/mongo_server.cpython-38.pyc b/codes/news_recsys/news_rec_server/dao/__pycache__/mongo_server.cpython-38.pyc deleted file mode 100644 index aa93fb9a..00000000 Binary files a/codes/news_recsys/news_rec_server/dao/__pycache__/mongo_server.cpython-38.pyc and /dev/null differ diff --git a/codes/news_recsys/news_rec_server/dao/__pycache__/mongo_server.cpython-39.pyc b/codes/news_recsys/news_rec_server/dao/__pycache__/mongo_server.cpython-39.pyc deleted file mode 100644 index ce3cae93..00000000 Binary files a/codes/news_recsys/news_rec_server/dao/__pycache__/mongo_server.cpython-39.pyc and /dev/null differ diff --git a/codes/news_recsys/news_rec_server/dao/__pycache__/mysql_server.cpython-38.pyc b/codes/news_recsys/news_rec_server/dao/__pycache__/mysql_server.cpython-38.pyc deleted file mode 100644 index 4b38f175..00000000 Binary files a/codes/news_recsys/news_rec_server/dao/__pycache__/mysql_server.cpython-38.pyc and /dev/null differ diff --git a/codes/news_recsys/news_rec_server/dao/__pycache__/redis_server.cpython-38.pyc b/codes/news_recsys/news_rec_server/dao/__pycache__/redis_server.cpython-38.pyc deleted file mode 100644 index a9254067..00000000 Binary files a/codes/news_recsys/news_rec_server/dao/__pycache__/redis_server.cpython-38.pyc and /dev/null differ diff --git a/codes/news_recsys/news_rec_server/dao/entity/__pycache__/logitem.cpython-38.pyc b/codes/news_recsys/news_rec_server/dao/entity/__pycache__/logitem.cpython-38.pyc deleted file mode 100644 index e7c42f99..00000000 Binary files a/codes/news_recsys/news_rec_server/dao/entity/__pycache__/logitem.cpython-38.pyc and /dev/null differ diff --git a/codes/news_recsys/news_rec_server/dao/entity/__pycache__/register_user.cpython-38.pyc b/codes/news_recsys/news_rec_server/dao/entity/__pycache__/register_user.cpython-38.pyc deleted file mode 100644 index 1c51d454..00000000 Binary files a/codes/news_recsys/news_rec_server/dao/entity/__pycache__/register_user.cpython-38.pyc and /dev/null differ diff --git a/codes/news_recsys/news_rec_server/dao/entity/__pycache__/user_collections.cpython-38.pyc b/codes/news_recsys/news_rec_server/dao/entity/__pycache__/user_collections.cpython-38.pyc deleted file mode 100644 index 9a5e91c9..00000000 Binary files a/codes/news_recsys/news_rec_server/dao/entity/__pycache__/user_collections.cpython-38.pyc and /dev/null differ diff --git a/codes/news_recsys/news_rec_server/dao/entity/__pycache__/user_exposure.cpython-38.pyc b/codes/news_recsys/news_rec_server/dao/entity/__pycache__/user_exposure.cpython-38.pyc deleted file mode 100644 index 437b2187..00000000 Binary files a/codes/news_recsys/news_rec_server/dao/entity/__pycache__/user_exposure.cpython-38.pyc and /dev/null differ diff --git a/codes/news_recsys/news_rec_server/dao/entity/__pycache__/user_likes.cpython-38.pyc b/codes/news_recsys/news_rec_server/dao/entity/__pycache__/user_likes.cpython-38.pyc deleted file mode 100644 index 3caaac25..00000000 Binary files a/codes/news_recsys/news_rec_server/dao/entity/__pycache__/user_likes.cpython-38.pyc and /dev/null differ diff --git a/codes/news_recsys/news_rec_server/dao/entity/__pycache__/user_read.cpython-38.pyc b/codes/news_recsys/news_rec_server/dao/entity/__pycache__/user_read.cpython-38.pyc deleted file mode 100644 index ecfc921b..00000000 Binary files a/codes/news_recsys/news_rec_server/dao/entity/__pycache__/user_read.cpython-38.pyc and /dev/null differ diff --git a/codes/news_recsys/news_rec_server/logs/material_and_user_process.log b/codes/news_recsys/news_rec_server/logs/material_and_user_process.log deleted file mode 100644 index 91381d67..00000000 --- a/codes/news_recsys/news_rec_server/logs/material_and_user_process.log +++ /dev/null @@ -1,20 +0,0 @@ -2021-12-04-00-10-21 -run update_new_items success. -update_dynamic_feature_protrail success. -delete RedisProtrail ... -run update_redis_mongo_protrail_data success. -process_material success. -process_user.py success. -news detail info are saved in redis db. -update_redis success. - -2021-12-05-00-14-24 -run update_new_items success. -update_dynamic_feature_protrail success. -delete RedisProtrail ... -run update_redis_mongo_protrail_data success. -process_material success. -process_user.py success. -news detail info are saved in redis db. -update_redis success. - diff --git a/codes/news_recsys/news_rec_server/logs/news_bad_cases.log b/codes/news_recsys/news_rec_server/logs/news_bad_cases.log deleted file mode 100644 index 8580f40f..00000000 --- a/codes/news_recsys/news_rec_server/logs/news_bad_cases.log +++ /dev/null @@ -1,151 +0,0 @@ -d04ec960-fb54-44c4-93d8-27aee82a14a5 -9eb67338-c4dd-4fab-b9f6-dd3d1f635078 -8e744b2e-283e-4880-a35d-010e22f9b6d1 -64a9131f-7bef-4026-af19-a437258b698b -5735d3ba-2ae7-44b0-87b1-a4212042dfd5 -2ec76526-1734-4631-85d5-38b53c289724 -2631c157-4bd1-469e-a69a-5cc40d56087e -0ed4e74d-5133-42fe-b9e9-c2f4a217aae6 -e6feadd0-b0ca-4dad-9cf0-e51d59208741 -de8f34ed-894f-454a-8af5-498cb5bfa416 -d6bfbcb5-e2aa-43af-85c1-a53776ee55da -c83f6e63-3614-46d4-9c3b-56acad2c6053 -ae39b902-7e4c-4392-972d-97d876e09802 -a2b457b0-232f-4175-a618-08bf257bff13 -8ce201a5-a59a-45e2-9c80-d1530213dd76 -5f52e821-b2f1-4328-85f0-62bc8e0b36e7 -328a2cc0-89bf-4626-b9ee-f3ee4c6873f8 -30fb8ac3-7cc8-4666-9aac-c66cd4cd8e20 -148e8b52-5407-4545-9c5a-1745236f8139 -06ab8ab1-0170-4fca-8f7a-900a82872378 -f9f4c879-005a-4d7a-827e-099666396bd4 -d04ec960-fb54-44c4-93d8-27aee82a14a5 -9eb67338-c4dd-4fab-b9f6-dd3d1f635078 -8e744b2e-283e-4880-a35d-010e22f9b6d1 -64a9131f-7bef-4026-af19-a437258b698b -5735d3ba-2ae7-44b0-87b1-a4212042dfd5 -2ec76526-1734-4631-85d5-38b53c289724 -2631c157-4bd1-469e-a69a-5cc40d56087e -0ed4e74d-5133-42fe-b9e9-c2f4a217aae6 -e6feadd0-b0ca-4dad-9cf0-e51d59208741 -de8f34ed-894f-454a-8af5-498cb5bfa416 -d6bfbcb5-e2aa-43af-85c1-a53776ee55da -c83f6e63-3614-46d4-9c3b-56acad2c6053 -ae39b902-7e4c-4392-972d-97d876e09802 -a2b457b0-232f-4175-a618-08bf257bff13 -8ce201a5-a59a-45e2-9c80-d1530213dd76 -5f52e821-b2f1-4328-85f0-62bc8e0b36e7 -328a2cc0-89bf-4626-b9ee-f3ee4c6873f8 -30fb8ac3-7cc8-4666-9aac-c66cd4cd8e20 -148e8b52-5407-4545-9c5a-1745236f8139 -06ab8ab1-0170-4fca-8f7a-900a82872378 -f9f4c879-005a-4d7a-827e-099666396bd4 -d04ec960-fb54-44c4-93d8-27aee82a14a5 -9eb67338-c4dd-4fab-b9f6-dd3d1f635078 -8e744b2e-283e-4880-a35d-010e22f9b6d1 -64a9131f-7bef-4026-af19-a437258b698b -5735d3ba-2ae7-44b0-87b1-a4212042dfd5 -2ec76526-1734-4631-85d5-38b53c289724 -2631c157-4bd1-469e-a69a-5cc40d56087e -0ed4e74d-5133-42fe-b9e9-c2f4a217aae6 -e6feadd0-b0ca-4dad-9cf0-e51d59208741 -de8f34ed-894f-454a-8af5-498cb5bfa416 -d6bfbcb5-e2aa-43af-85c1-a53776ee55da -c83f6e63-3614-46d4-9c3b-56acad2c6053 -ae39b902-7e4c-4392-972d-97d876e09802 -a2b457b0-232f-4175-a618-08bf257bff13 -8ce201a5-a59a-45e2-9c80-d1530213dd76 -5f52e821-b2f1-4328-85f0-62bc8e0b36e7 -328a2cc0-89bf-4626-b9ee-f3ee4c6873f8 -30fb8ac3-7cc8-4666-9aac-c66cd4cd8e20 -148e8b52-5407-4545-9c5a-1745236f8139 -06ab8ab1-0170-4fca-8f7a-900a82872378 -f9f4c879-005a-4d7a-827e-099666396bd4 -d04ec960-fb54-44c4-93d8-27aee82a14a5 -9eb67338-c4dd-4fab-b9f6-dd3d1f635078 -8e744b2e-283e-4880-a35d-010e22f9b6d1 -64a9131f-7bef-4026-af19-a437258b698b -5735d3ba-2ae7-44b0-87b1-a4212042dfd5 -2ec76526-1734-4631-85d5-38b53c289724 -2631c157-4bd1-469e-a69a-5cc40d56087e -0ed4e74d-5133-42fe-b9e9-c2f4a217aae6 -e6feadd0-b0ca-4dad-9cf0-e51d59208741 -de8f34ed-894f-454a-8af5-498cb5bfa416 -d6bfbcb5-e2aa-43af-85c1-a53776ee55da -c83f6e63-3614-46d4-9c3b-56acad2c6053 -ae39b902-7e4c-4392-972d-97d876e09802 -a2b457b0-232f-4175-a618-08bf257bff13 -8ce201a5-a59a-45e2-9c80-d1530213dd76 -5f52e821-b2f1-4328-85f0-62bc8e0b36e7 -328a2cc0-89bf-4626-b9ee-f3ee4c6873f8 -30fb8ac3-7cc8-4666-9aac-c66cd4cd8e20 -148e8b52-5407-4545-9c5a-1745236f8139 -06ab8ab1-0170-4fca-8f7a-900a82872378 -f9f4c879-005a-4d7a-827e-099666396bd4 -c1ea3624-7e16-41e3-91ca-5f2237c90016 -d04ec960-fb54-44c4-93d8-27aee82a14a5 -9eb67338-c4dd-4fab-b9f6-dd3d1f635078 -8e744b2e-283e-4880-a35d-010e22f9b6d1 -64a9131f-7bef-4026-af19-a437258b698b -5735d3ba-2ae7-44b0-87b1-a4212042dfd5 -2ec76526-1734-4631-85d5-38b53c289724 -2631c157-4bd1-469e-a69a-5cc40d56087e -0ed4e74d-5133-42fe-b9e9-c2f4a217aae6 -e6feadd0-b0ca-4dad-9cf0-e51d59208741 -de8f34ed-894f-454a-8af5-498cb5bfa416 -d6bfbcb5-e2aa-43af-85c1-a53776ee55da -c83f6e63-3614-46d4-9c3b-56acad2c6053 -ae39b902-7e4c-4392-972d-97d876e09802 -a2b457b0-232f-4175-a618-08bf257bff13 -8ce201a5-a59a-45e2-9c80-d1530213dd76 -5f52e821-b2f1-4328-85f0-62bc8e0b36e7 -328a2cc0-89bf-4626-b9ee-f3ee4c6873f8 -30fb8ac3-7cc8-4666-9aac-c66cd4cd8e20 -148e8b52-5407-4545-9c5a-1745236f8139 -06ab8ab1-0170-4fca-8f7a-900a82872378 -f9f4c879-005a-4d7a-827e-099666396bd4 -c1ea3624-7e16-41e3-91ca-5f2237c90016 -d04ec960-fb54-44c4-93d8-27aee82a14a5 -9eb67338-c4dd-4fab-b9f6-dd3d1f635078 -8e744b2e-283e-4880-a35d-010e22f9b6d1 -64a9131f-7bef-4026-af19-a437258b698b -5735d3ba-2ae7-44b0-87b1-a4212042dfd5 -2ec76526-1734-4631-85d5-38b53c289724 -2631c157-4bd1-469e-a69a-5cc40d56087e -0ed4e74d-5133-42fe-b9e9-c2f4a217aae6 -e6feadd0-b0ca-4dad-9cf0-e51d59208741 -de8f34ed-894f-454a-8af5-498cb5bfa416 -d6bfbcb5-e2aa-43af-85c1-a53776ee55da -c83f6e63-3614-46d4-9c3b-56acad2c6053 -ae39b902-7e4c-4392-972d-97d876e09802 -a2b457b0-232f-4175-a618-08bf257bff13 -8ce201a5-a59a-45e2-9c80-d1530213dd76 -5f52e821-b2f1-4328-85f0-62bc8e0b36e7 -328a2cc0-89bf-4626-b9ee-f3ee4c6873f8 -30fb8ac3-7cc8-4666-9aac-c66cd4cd8e20 -148e8b52-5407-4545-9c5a-1745236f8139 -06ab8ab1-0170-4fca-8f7a-900a82872378 -f9f4c879-005a-4d7a-827e-099666396bd4 -c1ea3624-7e16-41e3-91ca-5f2237c90016 -d04ec960-fb54-44c4-93d8-27aee82a14a5 -9eb67338-c4dd-4fab-b9f6-dd3d1f635078 -8e744b2e-283e-4880-a35d-010e22f9b6d1 -64a9131f-7bef-4026-af19-a437258b698b -5735d3ba-2ae7-44b0-87b1-a4212042dfd5 -2ec76526-1734-4631-85d5-38b53c289724 -2631c157-4bd1-469e-a69a-5cc40d56087e -0ed4e74d-5133-42fe-b9e9-c2f4a217aae6 -e6feadd0-b0ca-4dad-9cf0-e51d59208741 -de8f34ed-894f-454a-8af5-498cb5bfa416 -d6bfbcb5-e2aa-43af-85c1-a53776ee55da -c83f6e63-3614-46d4-9c3b-56acad2c6053 -ae39b902-7e4c-4392-972d-97d876e09802 -a2b457b0-232f-4175-a618-08bf257bff13 -8ce201a5-a59a-45e2-9c80-d1530213dd76 -5f52e821-b2f1-4328-85f0-62bc8e0b36e7 -328a2cc0-89bf-4626-b9ee-f3ee4c6873f8 -30fb8ac3-7cc8-4666-9aac-c66cd4cd8e20 -148e8b52-5407-4545-9c5a-1745236f8139 -06ab8ab1-0170-4fca-8f7a-900a82872378 -f9f4c879-005a-4d7a-827e-099666396bd4 -c1ea3624-7e16-41e3-91ca-5f2237c90016 diff --git a/codes/news_recsys/news_rec_server/logs/offline_material_process.log b/codes/news_recsys/news_rec_server/logs/offline_material_process.log deleted file mode 100644 index adbf1e45..00000000 --- a/codes/news_recsys/news_rec_server/logs/offline_material_process.log +++ /dev/null @@ -1,78 +0,0 @@ -2021-11-30-19-03-01 -scrapy crawl sina_spider --pages success. -run python monitor_news.py success. -run update_new_items success. -delete RedisProtrail ... -run update_redis_mongo_protrail_data success. -news detail info are saved in redis db. -material to mongo and redis success. - -2021-11-30-19-08-01 -scrapy crawl sina_spider --pages success. -run python monitor_news.py success. -run update_new_items success. -delete RedisProtrail ... -run update_redis_mongo_protrail_data success. -news detail info are saved in redis db. -material to mongo and redis success. - -material to mongo and redis fail. - -material to mongo and redis fail. - -2021-12-02-09-13-04 -scrapy crawl sina_spider --pages success. -the news nums of news_20211202 collection is 251 and less then 1000. -run python monitor_news.py success. -material to mongo and redis fail. - -material to mongo and redis fail. - -update_dynamic_feature_protrail success. -material to mongo and redis fail. - -update_dynamic_feature_protrail success. -run update_new_items success. -delete RedisProtrail ... -run update_redis_mongo_protrail_data success. -news detail info are saved in redis db. -material to mongo and redis success. - -2021-12-02-23-00-01 -scrapy crawl sina_spider --pages success. -the news nums of news_20211202 collection is 644 and less then 1000. -run python monitor_news.py success. -material to mongo and redis fail. - -material to mongo and redis fail. - -material to mongo and redis fail. - -update_dynamic_feature_protrail success. -run update_new_items success. -delete RedisProtrail ... -run update_redis_mongo_protrail_data success. -news detail info are saved in redis db. -material to mongo and redis success. - -2021-12-03-09-38-39 -scrapy crawl sina_spider --pages success. -the news nums of news_20211203 collection is 659 and less then 1000. -run python monitor_news.py success. -2021-12-03-09-50-04 -scrapy crawl sina_spider --pages success. -run python monitor_news.py success. -update_dynamic_feature_protrail success. -run update_new_items success. -delete RedisProtrail ... -run update_redis_mongo_protrail_data success. -news detail info are saved in redis db. -material to mongo and redis success. - -2021-12-04-00-00-01 -scrapy crawl sina_spider --pages success. -run python monitor_news.py success. -2021-12-05-00-00-01 -scrapy crawl sina_spider --pages success. -the news nums of news_20211205 collection is 793 and less then 1000. -run python monitor_news.py success. diff --git a/codes/news_recsys/news_rec_server/logs/offline_rec_list_to_redis.log b/codes/news_recsys/news_rec_server/logs/offline_rec_list_to_redis.log deleted file mode 100644 index 7311ac29..00000000 --- a/codes/news_recsys/news_rec_server/logs/offline_rec_list_to_redis.log +++ /dev/null @@ -1,98 +0,0 @@ -2021-11-30-19-03-01 -a sorted news_ids are saved into redis. -run /home/recsys/miniconda3/envs/news_rec_py3/bin/python /home/recsys/news_rec_server/recprocess/offline.py success. - -2021-11-30-19-08-19 -a sorted news_ids are saved into redis. -run /home/recsys/miniconda3/envs/news_rec_py3/bin/python /home/recsys/news_rec_server/recprocess/offline.py success. - -2021-12-01-01-00-01 -a sorted news_ids are saved into redis. -run /home/recsys/miniconda3/envs/news_rec_py3/bin/python /home/recsys/news_rec_server/recprocess/offline.py success. - -2021-12-02-01-00-01 -a sorted news_ids are saved into redis. -run /home/recsys/miniconda3/envs/news_rec_py3/bin/python /home/recsys/news_rec_server/recprocess/offline.py success. - -2021-12-02-09-13-30 -a sorted news_ids are saved into redis. -run /home/recsys/miniconda3/envs/news_rec_py3/bin/python /home/recsys/news_rec_server/recprocess/offline.py success. - -2021-12-02-09-18-07 -a sorted news_ids are saved into redis. -run /home/recsys/miniconda3/envs/news_rec_py3/bin/python /home/recsys/news_rec_server/recprocess/offline.py success. - -2021-12-02-09-23-18 -a sorted news_ids are saved into redis. -run /home/recsys/miniconda3/envs/news_rec_py3/bin/python /home/recsys/news_rec_server/recprocess/offline.py success. - -2021-12-03-01-00-02 -a sorted news_ids are saved into redis. -a hot rec list are saved into redis..... -run /home/recsys/miniconda3/envs/news_rec_py3/bin/python /home/recsys/news_rec_server/recprocess/offline.py success. - -2021-12-03-09-32-44 -a sorted news_ids are saved into redis. -a hot rec list are saved into redis..... -run /home/recsys/miniconda3/envs/news_rec_py3/bin/python /home/recsys/news_rec_server/recprocess/offline.py success. - -2021-12-03-09-33-10 -a sorted news_ids are saved into redis. -a hot rec list are saved into redis..... -run /home/recsys/miniconda3/envs/news_rec_py3/bin/python /home/recsys/news_rec_server/recprocess/offline.py success. - -2021-12-03-09-33-54 -a sorted news_ids are saved into redis. -a hot rec list are saved into redis..... -run /home/recsys/miniconda3/envs/news_rec_py3/bin/python /home/recsys/news_rec_server/recprocess/offline.py success. - -2021-12-03-10-05-18 -a sorted news_ids are saved into redis. -a hot rec list are saved into redis..... -run /home/recsys/miniconda3/envs/news_rec_py3/bin/python /home/recsys/news_rec_server/recprocess/offline.py success. - -2021-12-03-10-13-03 -a sorted news_ids are saved into redis. -a hot rec list are saved into redis..... -run /home/recsys/miniconda3/envs/news_rec_py3/bin/python /home/recsys/news_rec_server/recprocess/offline.py success. - -2021-12-03-10-14-12 -a sorted news_ids are saved into redis. -a hot rec list are saved into redis..... -run /home/recsys/miniconda3/envs/news_rec_py3/bin/python /home/recsys/news_rec_server/recprocess/offline.py success. - -2021-12-03-10-18-59 -a sorted news_ids are saved into redis. -a hot rec list are saved into redis..... -run /home/recsys/miniconda3/envs/news_rec_py3/bin/python /home/recsys/news_rec_server/recprocess/offline.py success. - -2021-12-03-10-22-57 -a sorted news_ids are saved into redis. -a hot rec list are saved into redis..... -run /home/recsys/miniconda3/envs/news_rec_py3/bin/python /home/recsys/news_rec_server/recprocess/offline.py success. - -2021-12-03-10-27-21 -a sorted news_ids are saved into redis. -a hot rec list are saved into redis..... -run /home/recsys/miniconda3/envs/news_rec_py3/bin/python /home/recsys/news_rec_server/recprocess/offline.py success. - -2021-12-03-10-28-49 -a sorted news_ids are saved into redis. -a hot rec list are saved into redis..... -run /home/recsys/miniconda3/envs/news_rec_py3/bin/python /home/recsys/news_rec_server/recprocess/offline.py success. - -2021-12-03-10-45-22 -a sorted news_ids are saved into redis. -a hot rec list are saved into redis..... -run /home/recsys/miniconda3/envs/news_rec_py3/bin/python /home/recsys/news_rec_server/recprocess/offline.py success. - -2021-12-04-00-11-59 -a sorted news_ids are saved into redis. -a hot rec list are saved into redis..... -run /home/recsys/miniconda3/envs/news_rec_py3/bin/python /home/recsys/news_rec_server/recprocess/offline.py success. - -2021-12-05-00-15-43 -a sorted news_ids are saved into redis. -a hot rec list are saved into redis..... -run /home/recsys/miniconda3/envs/news_rec_py3/bin/python /home/recsys/news_rec_server/recprocess/offline.py success. - diff --git a/codes/news_recsys/news_rec_server/materials/material_process/utils.py b/codes/news_recsys/news_rec_server/materials/material_process/utils.py index 82cba3f8..1d4e829d 100644 --- a/codes/news_recsys/news_rec_server/materials/material_process/utils.py +++ b/codes/news_recsys/news_rec_server/materials/material_process/utils.py @@ -7,7 +7,7 @@ import sys sys.path.append("../../") import jieba import jieba.analyse - +from conf.proj_path import stop_words_path def get_key_words(words_str): """提取中文中的关键词 @@ -22,8 +22,7 @@ def get_key_words(words_str): # 加载停用词 stopword_set = set() - # TODO 改成变量而不是写死 - with open('/home/recsys/news_rec_server/conf/stop_words.txt', encoding="utf-8") as f: + with open(stop_words_path, encoding="utf-8") as f: line = f.readline().rstrip() stopword_set.add(line) @@ -41,10 +40,10 @@ def get_key_words(words_str): key_words_list_tfidf = jieba.analyse.extract_tags(new_words_str, topK=10, withWeight=False, allowPOS=('ns', 'n', 'vn', 'v')) key_words_list_textrank = jieba.analyse.textrank(new_words_str, topK=10, withWeight=False, allowPOS=('ns', 'n', 'vn', 'v')) - # print("key_words_list_tfidf", key_words_list_tfidf) - # print("key_words_list_textrank", key_words_list_textrank) - tfidf_textrank_list = list(set(key_words_list_tfidf) & set(key_words_list_textrank))[:3] - # print(tfidf_textrank_list) return tfidf_textrank_list + +if __name__ == "__main__": + key_words = get_key_words("本教程主要是针对具有机器学习基础并想找推荐算法岗位的同学,由推荐算法基础、推荐算法入门赛、新闻推荐项目及推荐算法面经组成,形成了一个完整的从基础到实战再到面试的闭环。主要分为三个阶段,分别是推荐系统基础、推荐系统进阶和推荐算法面经,每个阶段的具体内容如下") + print(key_words) \ No newline at end of file diff --git a/codes/news_recsys/news_rec_server/recprocess/__pycache__/online.cpython-38.pyc b/codes/news_recsys/news_rec_server/recprocess/__pycache__/online.cpython-38.pyc index 1a632933..a48bb997 100644 Binary files a/codes/news_recsys/news_rec_server/recprocess/__pycache__/online.cpython-38.pyc and b/codes/news_recsys/news_rec_server/recprocess/__pycache__/online.cpython-38.pyc differ diff --git a/codes/news_recsys/news_rec_server/recprocess/offline.py b/codes/news_recsys/news_rec_server/recprocess/offline.py index 998f38e5..0f4569aa 100644 --- a/codes/news_recsys/news_rec_server/recprocess/offline.py +++ b/codes/news_recsys/news_rec_server/recprocess/offline.py @@ -5,7 +5,6 @@ from dao.mongo_server import MongoServer from dao.redis_server import RedisServer from recall.hot_recall import HotRecall from cold_start.cold_start import ColdStart -from datetime import datetime # 这个类是用来实现离线推荐流程的,给每个用户都存储一个倒排索引列表 # 对于热门页的内容,初始化的时候每个用户都是一样的 diff --git a/codes/news_recsys/news_rec_server/recprocess/online.py b/codes/news_recsys/news_rec_server/recprocess/online.py index ae91b1ac..e9f304cb 100644 --- a/codes/news_recsys/news_rec_server/recprocess/online.py +++ b/codes/news_recsys/news_rec_server/recprocess/online.py @@ -61,12 +61,12 @@ class OnlineServer(object): user_exposure_key = user_exposure_prefix + str(userid) # 一页默认10个item, 但这里候选20条,因为有可能有的在推荐页曝光过 - article_num = 50 + article_num = 200 # 返回的是一个news_id列表 zrevrange排序分值从大到小 candiate_id_list = self.reclist_redis_db.zrevrange(cold_start_user_key, 0, article_num-1) - print("candiate_id_list", candiate_id_list) + # print("candiate_id_list", candiate_id_list) if len(candiate_id_list) > 0: # 根据news_id获取新闻的具体内容,并返回一个列表,列表中的元素是按照顺序展示的新闻信息字典 @@ -190,13 +190,11 @@ class OnlineServer(object): self.reclist_redis_db.zunionstore(hot_list_user_key, ["hot_list"]) # 一页默认10个item, 但这里候选20条,因为有可能有的在推荐页曝光过 - article_num = 50 + article_num = 200 # 返回的是一个news_id列表 zrevrange排序分值从大到小 candiate_id_list = self.reclist_redis_db.zrevrange(hot_list_user_key, 0, article_num-1) - print("candiate_id_list", candiate_id_list) - if len(candiate_id_list) > 0: # 根据news_id获取新闻的具体内容,并返回一个列表,列表中的元素是按照顺序展示的新闻信息字典 news_info_list = [] @@ -229,8 +227,6 @@ class OnlineServer(object): f.write(news_id + "\n") print("there are not news detail info for {}".format(news_id)) continue - # news_info_str = news_info_str.replace("'", '"' ) # 将单引号都替换成双引号 - # news_info_dict = json.loads(news_info_str) # 需要确认一下前端接收的json,key需要是单引号还是双引号 news_info_list.append(news_info_dict) news_expose_list.add(news_id) @@ -247,8 +243,6 @@ class OnlineServer(object): # 曝光重新落表 self._save_user_exposure(user_id,news_expose_list) - #print(news_expose_list, len(news_expose_list)) - # print(news_info_list) return news_info_list else: #TODO 临时这么做,这么做不太好 @@ -262,22 +256,14 @@ class OnlineServer(object): def get_news_detail(self, news_id): """获取新闻展示的详细信息 """ - # print(1111) news_info_str = self.static_news_info_redis_db.get("static_news_detail:" + news_id) - # print(222) - # print(news_info_str) news_info_str = news_info_str.replace('\'', '\"' ) # 将单引号都替换成双引号 - # print(333) - # print(news_info_str) news_info_dit = json.loads(news_info_str) - # print(444) - # print("news_info_dit:", news_info_dit) news_dynamic_info_str = self.dynamic_news_info_redis_db.get("dynamic_news_detail:" + news_id) news_dynamic_info_str = news_dynamic_info_str.replace("'", '"' ) # 将单引号都替换成双引号 news_dynamic_info_dit = json.loads(news_dynamic_info_str) - # print("news_info_dit:", news_dynamic_info_dit) for k in news_dynamic_info_dit.keys(): news_info_dit[k] = news_dynamic_info_dit[k] @@ -298,9 +284,7 @@ class OnlineServer(object): news_dynamic_info_dict[action_type[0]] -=1 else: news_dynamic_info_dict["read_num"] +=1 - # print("update",news_dynamic_info_dict) news_dynamic_info_str = json.dumps(news_dynamic_info_dict) - # print("update",news_dynamic_info_str) news_dynamic_info_str = news_dynamic_info_str.replace('"', "'" ) res = self.dynamic_news_info_redis_db.set("dynamic_news_detail:" + news_id, news_dynamic_info_str) return res diff --git a/codes/news_recsys/news_rec_server/requirements.txt b/codes/news_recsys/news_rec_server/requirements.txt index cd178d30..38b789c6 100644 --- a/codes/news_recsys/news_rec_server/requirements.txt +++ b/codes/news_recsys/news_rec_server/requirements.txt @@ -3,6 +3,7 @@ attrs==21.2.0 Automat==20.2.0 certifi==2021.10.8 cffi==1.15.0 +charset-normalizer==2.0.8 click==8.0.3 constantly==15.1.0 cryptography==35.0.0 @@ -37,15 +38,19 @@ PyDispatcher==2.0.5 pymongo==3.12.1 PyMySQL==1.0.2 pyOpenSSL==21.0.0 +pysnowflake==0.1.3 queuelib==1.6.2 redis==3.5.3 +requests==2.26.0 Scrapy==2.5.1 selenium==4.0.0 service-identity==21.1.0 six==1.16.0 sniffio==1.2.0 +snowflake==0.0.3 sortedcontainers==2.4.0 SQLAlchemy==1.4.26 +tornado==6.1 trio==0.19.0 trio-websocket==0.9.2 Twisted==21.7.0 diff --git a/codes/news_recsys/news_rec_server/scheduler/crawl_news.sh b/codes/news_recsys/news_rec_server/scheduler/crawl_news.sh index 0da80e1f..3eb94c0a 100755 --- a/codes/news_recsys/news_rec_server/scheduler/crawl_news.sh +++ b/codes/news_recsys/news_rec_server/scheduler/crawl_news.sh @@ -1,9 +1,9 @@ #!/bin/bash - -# 这个脚本每天凌晨2点30会自动跑 -# 设置python环境 +# python 环境需要换成自己的虚拟环境中的Python python=/home/recsys/miniconda3/envs/news_rec_py3/bin/python -news_recsys_path="/home/recsys/news_rec_server" +home_path=$HOME + +news_recsys_path=${home_path}"/news_rec_server" # 得跳转到这个目录才能执行下面爬虫的命令 cd ${news_recsys_path}/materials/news_scrapy diff --git a/codes/news_recsys/news_rec_server/scheduler/offline_material_and_user_process.sh b/codes/news_recsys/news_rec_server/scheduler/offline_material_and_user_process.sh index d1cda05c..43620f95 100755 --- a/codes/news_recsys/news_rec_server/scheduler/offline_material_and_user_process.sh +++ b/codes/news_recsys/news_rec_server/scheduler/offline_material_and_user_process.sh @@ -1,7 +1,8 @@ #!/bin/bash - +# python 环境需要换成自己的虚拟环境中的Python python=/home/recsys/miniconda3/envs/news_rec_py3/bin/python -news_recsys_path="/home/recsys/news_rec_server" +home_path=$HOME +news_recsys_path=${home_path}"/news_rec_server" echo "$(date -d today +%Y-%m-%d-%H-%M-%S)" diff --git a/codes/news_recsys/news_rec_server/scheduler/run_offline.sh b/codes/news_recsys/news_rec_server/scheduler/run_offline.sh index 0793542e..a55d0443 100755 --- a/codes/news_recsys/news_rec_server/scheduler/run_offline.sh +++ b/codes/news_recsys/news_rec_server/scheduler/run_offline.sh @@ -1,7 +1,9 @@ #!/bin/bash +# python 环境需要换成自己的虚拟环境中的Python python=/home/recsys/miniconda3/envs/news_rec_py3/bin/python -news_recsys_path="/home/recsys/news_rec_server" +home_path=$HOME +news_recsys_path=${home_path}"/news_rec_server" cd ${news_recsys_path}/recprocess