From 8ad200eb01472b675d18a333e35ca6c89c530f64 Mon Sep 17 00:00:00 2001 From: erenup Date: Thu, 2 Sep 2021 08:14:50 +0800 Subject: [PATCH] fix --- .../4.5-生成任务-语言模型.ipynb | 239 +++--------------- .../4.5-生成任务-语言模型.md | 221 ++-------------- 2 files changed, 49 insertions(+), 411 deletions(-) diff --git a/docs/篇章4-使用Transformers解决NLP任务/4.5-生成任务-语言模型.ipynb b/docs/篇章4-使用Transformers解决NLP任务/4.5-生成任务-语言模型.ipynb index aa1a322..a36bfa1 100644 --- a/docs/篇章4-使用Transformers解决NLP任务/4.5-生成任务-语言模型.ipynb +++ b/docs/篇章4-使用Transformers解决NLP任务/4.5-生成任务-语言模型.ipynb @@ -97,29 +97,7 @@ "output_type": "stream", "name": "stderr", "text": [ - "Downloading: 8.33kB [00:00, 1.49MB/s] \n", - "Downloading: 5.83kB [00:00, 1.77MB/s] \n" - ] - }, - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Downloading and preparing dataset wikitext/wikitext-2-raw-v1 (download: 4.50 MiB, generated: 12.91 MiB, post-processed: Unknown size, total: 17.41 MiB) to /Users/niepig/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/aa5e094000ec7afeb74c3be92c88313cd6f132d564c7effd961c10fd47c76f20...\n" - ] - }, - { - "output_type": "stream", - "name": "stderr", - "text": [ - "Downloading: 100%|██████████| 4.72M/4.72M [00:02<00:00, 1.91MB/s]\n" - ] - }, - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Dataset wikitext downloaded and prepared to /Users/niepig/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/aa5e094000ec7afeb74c3be92c88313cd6f132d564c7effd961c10fd47c76f20. Subsequent calls will reuse this data.\n" + "Reusing dataset wikitext (/Users/niepig/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/aa5e094000ec7afeb74c3be92c88313cd6f132d564c7effd961c10fd47c76f20)\n" ] } ], @@ -262,15 +240,15 @@ " \n", " \n", " 0\n", - " Plum cakes made with fresh plums came with other migrants from other traditions in which plum cake is prepared using plum as a primary ingredient . In some versions , the plums may become jam @-@ like inside the cake after cooking , or be prepared using plum jam . Plum cake prepared with plums is also a part of Ashkenazi Jewish cuisine , and is referred to as Pflaumenkuchen or Zwetschgenkuchen . Other plum @-@ based cakes are found in French , Italian and Polish cooking . \\n\n", + " On 3 March 1967 , parliament decided to build four short take @-@ off and landing airports along the Helgeland coast between Trondheim and Bodø . Braathens placed an order for a de Havilland Canada DHC @-@ 6 Twin Otter and planned to start the company Braathens STOL . It applied to operate the route without subsidies , but the concession was rejected and granted with subsidies to Widerøe , which had been operating the routes using seaplanes . \\n\n", " \n", " \n", " 1\n", - " = = = Language = = = \\n\n", + " \n", " \n", " \n", " 2\n", - " \n", + " Rao Ramesh was cast as a tantrik who helps Gill 's character in the present era . Mumaith Khan was selected for another item number , a remix version of the hit song \" Bangaru Kodipetta \" from Gharana Mogudu ( 1992 ) ; Gharana Mogudu 's music was also composed by M. M. Keeravani . Chiranjeevi made a special appearance after the song , making Magadheera the first film he appeared in after his entry into politics . When Rajamouli suggested the idea of a cameo appearance , Chiranjeevi was initially hesitant till the director narrated the complete sequence and the importance of the song . 
\\n\n", " \n", " \n", " 3\n", @@ -278,23 +256,23 @@ " \n", " \n", " 4\n", - " The town 's population not only recovered but grew ; the 1906 census of the Canadian Prairies listed the population at 1 @,@ 178 . A new study commissioned by the Dominion government determined that the cracks in the mountain continued to grow and that the risk of another slide remained . Consequently , parts of Frank closest to the mountain were dismantled or relocated to safer areas . \\n\n", - " \n", - " \n", - " 5\n", - " The Litigators is a 2011 legal thriller novel by John Grisham , his 25th fiction novel overall . The Litigators is about a two @-@ partner Chicago law firm attempting to strike it rich in a class action lawsuit over a cholesterol reduction drug by a major pharmaceutical drug company . The protagonist is a Harvard Law School grad big law firm burnout who stumbles upon the boutique and joins it only to find himself litigating against his old law firm in this case . The book is regarded as more humorous than most of Grisham 's prior novels . \\n\n", - " \n", - " \n", - " 6\n", " \n", " \n", " \n", + " 5\n", + " = = = Total Nonstop Action Wrestling ( 2015 – present ) = = = \\n\n", + " \n", + " \n", + " 6\n", + " The Daily Telegraph gave the visual novel the award for \" Best Script \" in its video game awards of 2011 , stating that \" Love 's layered narrative of a high school teacher embroiled in his student ’ s worries goes places most mainstream video games wouldn 't dare . \" \\n\n", + " \n", + " \n", " 7\n", - " On December 7 , 2006 , Headquarters Marine Corps released a message stating that 2nd Battalion 9th Marines would be reactivated during 2007 as part of the continuing Global War on Terror . 2nd Battalion 9th Marines was re @-@ activated on July 13 , 2007 and replaced the Anti @-@ Terrorism Battalion ( ATBn ) . In September 2008 , Marines and Sailors from 2 / 9 deployed to Al Anbar Province in support of Operation Iraqi Freedom . They were based in the city of Ramadi and returned in April 2009 without any Marines or Sailors killed in action . July 2010 Marines and Sailors from 2 / 9 deployed to Marjah , Helmand Province , Afghanistan in support of Operation Enduring Freedom . In December 2010 Echo Company from 2 / 9 were attached to 3 / 5 in Sangin , Afghanistan where they earned the notorious nickname of \" Green Hats . \" They returned February 2011 . They redeployed back to Marjah December 2011 and returned July 2012 . Echo and Weapons companies deployed once more to Afghanistan from January through April 2013 , participating in combat operations out of Camp Leatherneck . On April 1 , 2015 the battalion was deactivated in a ceremony at Camp Lejeune . 
\\n\n", + " \n", " \n", " \n", " 8\n", - " ( i ) = Indoor \\n\n", + " \n", " \n", " \n", " 9\n", @@ -383,18 +361,7 @@ " \n", "tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)" ], - "outputs": [ - { - "output_type": "stream", - "name": "stderr", - "text": [ - "Downloading: 100%|██████████| 762/762 [00:00<00:00, 358kB/s]\n", - "Downloading: 100%|██████████| 1.04M/1.04M [00:04<00:00, 235kB/s]\n", - "Downloading: 100%|██████████| 456k/456k [00:02<00:00, 217kB/s]\n", - "Downloading: 100%|██████████| 1.36M/1.36M [00:05<00:00, 252kB/s]\n" - ] - } - ], + "outputs": [], "metadata": { "id": "mQwZ5UssWdB_" } @@ -431,72 +398,11 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "source": [ "tokenized_datasets = datasets.map(tokenize_function, batched=True, num_proc=4, remove_columns=[\"text\"])" ], - "outputs": [ - { - "output_type": "stream", - "name": "stderr", - "text": [ - "#0: 0%| | 0/2 [00:00\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mtrainer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtrain\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[0;32m~/Desktop/zhihu/learn-nlp-with-transformers/venv/lib/python3.8/site-packages/transformers/trainer.py\u001b[0m in \u001b[0;36mtrain\u001b[0;34m(self, resume_from_checkpoint, trial, **kwargs)\u001b[0m\n\u001b[1;32m 1032\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcontrol\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcallback_handler\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mon_epoch_begin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstate\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcontrol\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1033\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1034\u001b[0;31m \u001b[0;32mfor\u001b[0m \u001b[0mstep\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minputs\u001b[0m \u001b[0;32min\u001b[0m \u001b[0menumerate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mepoch_iterator\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1035\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1036\u001b[0m \u001b[0;31m# Skip past any already trained steps if resuming training\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/Desktop/zhihu/learn-nlp-with-transformers/venv/lib/python3.8/site-packages/torch/utils/data/dataloader.py\u001b[0m in \u001b[0;36m__next__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 519\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_sampler_iter\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 520\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_reset\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 521\u001b[0;31m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_next_data\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 522\u001b[0m 
\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_num_yielded\u001b[0m \u001b[0;34m+=\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 523\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_dataset_kind\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0m_DatasetKind\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mIterable\u001b[0m \u001b[0;32mand\u001b[0m\u001b[0;31m \u001b[0m\u001b[0;31m\\\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/Desktop/zhihu/learn-nlp-with-transformers/venv/lib/python3.8/site-packages/torch/utils/data/dataloader.py\u001b[0m in \u001b[0;36m_next_data\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 559\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_next_data\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 560\u001b[0m \u001b[0mindex\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_next_index\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;31m# may raise StopIteration\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 561\u001b[0;31m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_dataset_fetcher\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfetch\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mindex\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;31m# may raise StopIteration\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 562\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_pin_memory\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 563\u001b[0m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_utils\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpin_memory\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpin_memory\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/Desktop/zhihu/learn-nlp-with-transformers/venv/lib/python3.8/site-packages/torch/utils/data/_utils/fetch.py\u001b[0m in \u001b[0;36mfetch\u001b[0;34m(self, possibly_batched_index)\u001b[0m\n\u001b[1;32m 42\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mfetch\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpossibly_batched_index\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 43\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mauto_collation\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 44\u001b[0;31m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdataset\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0midx\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0midx\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mpossibly_batched_index\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 45\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 46\u001b[0m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdataset\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mpossibly_batched_index\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/Desktop/zhihu/learn-nlp-with-transformers/venv/lib/python3.8/site-packages/torch/utils/data/_utils/fetch.py\u001b[0m in \u001b[0;36m\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m 42\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mfetch\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpossibly_batched_index\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 43\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mauto_collation\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 44\u001b[0;31m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdataset\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0midx\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0midx\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mpossibly_batched_index\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 45\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 46\u001b[0m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdataset\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mpossibly_batched_index\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mKeyError\u001b[0m: 1" + " 0%| | 31/7002 [04:16<14:27:52, 7.47s/it]" ] } ], @@ -1070,8 +891,8 @@ "trainer = Trainer(\n", " model=model,\n", " args=training_args,\n", - " train_dataset=lm_datasets[\"train\"][:1000],\n", - " eval_dataset=lm_datasets[\"validation\"][:100],\n", + " train_dataset=lm_datasets[\"train\"],\n", + " eval_dataset=lm_datasets[\"validation\"],\n", " data_collator=data_collator,\n", ")" ], diff --git a/docs/篇章4-使用Transformers解决NLP任务/4.5-生成任务-语言模型.md b/docs/篇章4-使用Transformers解决NLP任务/4.5-生成任务-语言模型.md index ea1aadd..6c44242 100644 --- a/docs/篇章4-使用Transformers解决NLP任务/4.5-生成任务-语言模型.md +++ b/docs/篇章4-使用Transformers解决NLP任务/4.5-生成任务-语言模型.md @@ -41,17 +41,7 @@ from datasets import load_dataset datasets = load_dataset('wikitext', 'wikitext-2-raw-v1') ``` - Downloading: 8.33kB [00:00, 1.49MB/s] - Downloading: 5.83kB [00:00, 1.77MB/s] - - - Downloading and preparing dataset wikitext/wikitext-2-raw-v1 (download: 4.50 MiB, generated: 12.91 MiB, post-processed: Unknown size, total: 17.41 MiB) to /Users/niepig/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/aa5e094000ec7afeb74c3be92c88313cd6f132d564c7effd961c10fd47c76f20... - - - Downloading: 100%|██████████| 4.72M/4.72M [00:02<00:00, 1.91MB/s] - - - Dataset wikitext downloaded and prepared to /Users/niepig/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/aa5e094000ec7afeb74c3be92c88313cd6f132d564c7effd961c10fd47c76f20. Subsequent calls will reuse this data. 
+ Reusing dataset wikitext (/Users/niepig/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/aa5e094000ec7afeb74c3be92c88313cd6f132d564c7effd961c10fd47c76f20) 如果碰到以下错误: @@ -127,15 +117,15 @@ show_random_elements(datasets["train"]) 0 - Plum cakes made with fresh plums came with other migrants from other traditions in which plum cake is prepared using plum as a primary ingredient . In some versions , the plums may become jam @-@ like inside the cake after cooking , or be prepared using plum jam . Plum cake prepared with plums is also a part of Ashkenazi Jewish cuisine , and is referred to as Pflaumenkuchen or Zwetschgenkuchen . Other plum @-@ based cakes are found in French , Italian and Polish cooking . \n + On 3 March 1967 , parliament decided to build four short take @-@ off and landing airports along the Helgeland coast between Trondheim and Bodø . Braathens placed an order for a de Havilland Canada DHC @-@ 6 Twin Otter and planned to start the company Braathens STOL . It applied to operate the route without subsidies , but the concession was rejected and granted with subsidies to Widerøe , which had been operating the routes using seaplanes . \n 1 - = = = Language = = = \n + 2 - + Rao Ramesh was cast as a tantrik who helps Gill 's character in the present era . Mumaith Khan was selected for another item number , a remix version of the hit song " Bangaru Kodipetta " from Gharana Mogudu ( 1992 ) ; Gharana Mogudu 's music was also composed by M. M. Keeravani . Chiranjeevi made a special appearance after the song , making Magadheera the first film he appeared in after his entry into politics . When Rajamouli suggested the idea of a cameo appearance , Chiranjeevi was initially hesitant till the director narrated the complete sequence and the importance of the song . \n 3 @@ -143,23 +133,23 @@ show_random_elements(datasets["train"]) 4 - The town 's population not only recovered but grew ; the 1906 census of the Canadian Prairies listed the population at 1 @,@ 178 . A new study commissioned by the Dominion government determined that the cracks in the mountain continued to grow and that the risk of another slide remained . Consequently , parts of Frank closest to the mountain were dismantled or relocated to safer areas . \n - - - 5 - The Litigators is a 2011 legal thriller novel by John Grisham , his 25th fiction novel overall . The Litigators is about a two @-@ partner Chicago law firm attempting to strike it rich in a class action lawsuit over a cholesterol reduction drug by a major pharmaceutical drug company . The protagonist is a Harvard Law School grad big law firm burnout who stumbles upon the boutique and joins it only to find himself litigating against his old law firm in this case . The book is regarded as more humorous than most of Grisham 's prior novels . \n - - - 6 + + 5 + = = = Total Nonstop Action Wrestling ( 2015 – present ) = = = \n + + + 6 + The Daily Telegraph gave the visual novel the award for " Best Script " in its video game awards of 2011 , stating that " Love 's layered narrative of a high school teacher embroiled in his student ’ s worries goes places most mainstream video games wouldn 't dare . " \n + 7 - On December 7 , 2006 , Headquarters Marine Corps released a message stating that 2nd Battalion 9th Marines would be reactivated during 2007 as part of the continuing Global War on Terror . 2nd Battalion 9th Marines was re @-@ activated on July 13 , 2007 and replaced the Anti @-@ Terrorism Battalion ( ATBn ) . 
In September 2008 , Marines and Sailors from 2 / 9 deployed to Al Anbar Province in support of Operation Iraqi Freedom . They were based in the city of Ramadi and returned in April 2009 without any Marines or Sailors killed in action . July 2010 Marines and Sailors from 2 / 9 deployed to Marjah , Helmand Province , Afghanistan in support of Operation Enduring Freedom . In December 2010 Echo Company from 2 / 9 were attached to 3 / 5 in Sangin , Afghanistan where they earned the notorious nickname of " Green Hats . " They returned February 2011 . They redeployed back to Marjah December 2011 and returned July 2012 . Echo and Weapons companies deployed once more to Afghanistan from January through April 2013 , participating in combat operations out of Camp Leatherneck . On April 1 , 2015 the battalion was deactivated in a ceremony at Camp Lejeune . \n + 8 - ( i ) = Indoor \n + 9 @@ -201,12 +191,6 @@ from transformers import AutoTokenizer tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True) ``` - Downloading: 100%|██████████| 762/762 [00:00<00:00, 358kB/s] - Downloading: 100%|██████████| 1.04M/1.04M [00:04<00:00, 235kB/s] - Downloading: 100%|██████████| 456k/456k [00:02<00:00, 217kB/s] - Downloading: 100%|██████████| 1.36M/1.36M [00:05<00:00, 252kB/s] - - 我们现在可以对所有的文本调用分词器,该操作可以简单地使用来自Datasets库的map方法实现。首先,我们定义一个在文本上调用标记器的函数: @@ -223,62 +207,6 @@ def tokenize_function(examples): tokenized_datasets = datasets.map(tokenize_function, batched=True, num_proc=4, remove_columns=["text"]) ``` - #0: 0%| | 0/2 [00:00 - ----> 1 trainer.train() - - - ~/Desktop/zhihu/learn-nlp-with-transformers/venv/lib/python3.8/site-packages/transformers/trainer.py in train(self, resume_from_checkpoint, trial, **kwargs) - 1032 self.control = self.callback_handler.on_epoch_begin(self.args, self.state, self.control) - 1033 - -> 1034 for step, inputs in enumerate(epoch_iterator): - 1035 - 1036 # Skip past any already trained steps if resuming training - - - ~/Desktop/zhihu/learn-nlp-with-transformers/venv/lib/python3.8/site-packages/torch/utils/data/dataloader.py in __next__(self) - 519 if self._sampler_iter is None: - 520 self._reset() - --> 521 data = self._next_data() - 522 self._num_yielded += 1 - 523 if self._dataset_kind == _DatasetKind.Iterable and \ - - - ~/Desktop/zhihu/learn-nlp-with-transformers/venv/lib/python3.8/site-packages/torch/utils/data/dataloader.py in _next_data(self) - 559 def _next_data(self): - 560 index = self._next_index() # may raise StopIteration - --> 561 data = self._dataset_fetcher.fetch(index) # may raise StopIteration - 562 if self._pin_memory: - 563 data = _utils.pin_memory.pin_memory(data) - - - ~/Desktop/zhihu/learn-nlp-with-transformers/venv/lib/python3.8/site-packages/torch/utils/data/_utils/fetch.py in fetch(self, possibly_batched_index) - 42 def fetch(self, possibly_batched_index): - 43 if self.auto_collation: - ---> 44 data = [self.dataset[idx] for idx in possibly_batched_index] - 45 else: - 46 data = self.dataset[possibly_batched_index] - - - ~/Desktop/zhihu/learn-nlp-with-transformers/venv/lib/python3.8/site-packages/torch/utils/data/_utils/fetch.py in (.0) - 42 def fetch(self, possibly_batched_index): - 43 if self.auto_collation: - ---> 44 data = [self.dataset[idx] for idx in possibly_batched_index] - 45 else: - 46 data = self.dataset[possibly_batched_index] - - - KeyError: 1 - + 0%| | 31/7002 [04:16<14:27:52, 7.47s/it] 一旦训练完成,我们就可以评估我们的模型,得到它在验证集上的perplexity,如下所示: @@ -597,8 +414,8 @@ data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, 
mlm_probabi
 trainer = Trainer(
     model=model,
     args=training_args,
-    train_dataset=lm_datasets["train"][:1000],
-    eval_dataset=lm_datasets["validation"][:100],
+    train_dataset=lm_datasets["train"],
+    eval_dataset=lm_datasets["validation"],
     data_collator=data_collator,
 )
 ```
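
Note on the final hunk: indexing a `datasets.Dataset` with a Python slice (e.g. `lm_datasets["train"][:1000]`) returns a plain dict of column lists rather than a `Dataset`, so the `Trainer`'s DataLoader then indexes that dict with integer sample indices and fails — which matches the `KeyError: 1` traceback removed from the notebook output earlier in this patch. Passing the full `Dataset` objects, as the hunk now does, avoids that. If a smaller subset is still wanted for a quick experiment, the sketch below uses `Dataset.select`, which keeps the result a proper `Dataset`; it assumes the `lm_datasets`, `model`, `training_args` and `data_collator` objects defined earlier in the tutorial.

```python
from transformers import Trainer

# Slicing (lm_datasets["train"][:1000]) would yield a dict of lists; .select()
# returns a Dataset that the Trainer's DataLoader can index row by row.
small_train = lm_datasets["train"].select(range(1000))
small_eval = lm_datasets["validation"].select(range(100))

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train,   # or lm_datasets["train"] for the full set, as in this patch
    eval_dataset=small_eval,     # or lm_datasets["validation"]
    data_collator=data_collator,
)
```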