Stefano Fiorucci committed · Commit 82fe524 · Parent(s): a251941

crawler refactoring
Files changed:
- crawler/README.md +15 -0
- crawler/data/.gitkeep +0 -0
- crawler/requirements.txt +1 -1
- crawler/tpcrawler/tpcrawler/spiders/tpcrawler.py +32 -30
crawler/README.md
ADDED
@@ -0,0 +1,15 @@
+# Twin Peaks crawler
+
+This crawler downloads texts and metadata from the [Twin Peaks Fandom Wiki](https://twinpeaks.fandom.com/wiki/Twin_Peaks_Wiki). The output format is JSON. The crawler combines [Scrapy](https://github.com/scrapy/scrapy) and [fandom-py](https://github.com/NikolajDanger/fandom-py).
+
+*Several wiki pages are discarded, since they are not related to the Twin Peaks plot and create noise in the Question Answering index.*
+
+## Installation
+- `pip install -r requirements.txt`
+- copy this folder (if needed, see [stackoverflow](https://stackoverflow.com/questions/7106012/download-a-single-folder-or-directory-from-a-github-repo))
+
+## Usage
+- (if needed, activate the virtual environment)
+- `cd tpcrawler`
+- `scrapy crawl tpcrawler`
+- you can find the downloaded pages in the `data` subfolder
crawler/data/.gitkeep
ADDED
File without changes
crawler/requirements.txt
CHANGED
@@ -1,2 +1,2 @@
 fandom-py==0.2.1
-Scrapy==2.
+Scrapy==2.6.1
crawler/tpcrawler/tpcrawler/spiders/tpcrawler.py
CHANGED
@@ -1,14 +1,11 @@
 import scrapy
-from scrapy.utils.response import open_in_browser
 from scrapy.http import TextResponse
 import re
 import fandom
 import json

-
-
-article_id_pattern="wgArticleId\"\:([0-9]+)"
-categories_xpath="//div[@class='page-header__categories']/a//text()"
+# Categories unrelated to Twin Peaks plot
+# (they make noise in the index)
 excluded_categories=set("""Twin Peaks (2017) crew
 Actors
 Camera and electrical department
@@ -34,7 +31,13 @@ Decades
 Days
 Production timeline""".split("\n"))

-
+fandom.set_wiki("Twinpeaks")
+
+article_id_pattern = "wgArticleId\"\:([0-9]+)"
+categories_xpath = "//div[@class='page-header__categories']/a//text()"
+wiki_page_href_xpath = "//ul[@class='mw-allpages-chunk']/li/a[not(contains(@class, 'redirect'))]/@href"
+next_page_href_xpath = "//div[@class='mw-allpages-nav']/a[contains(.,'Next page')]/@href"
+

 class Tpcrawler(scrapy.Spider):
     name = 'tpcrawler'
@@ -43,40 +46,39 @@ class Tpcrawler(scrapy.Spider):


     def parse(self, response):
-
-
-        hrefs = response.xpath("//ul[@class='mw-allpages-chunk']/li/a[not(contains(@class, 'redirect'))]/@href").extract()
+        """Start from wiki "all pages" list and open them"""
+        hrefs = response.xpath(wiki_page_href_xpath).extract()
         for href in hrefs:
-            yield scrapy.Request(url=response.urljoin(href),
-
-
-
-        next_page = response.xpath("//div[@class='mw-allpages-nav']/a[contains(.,'Next page')]/@href").extract_first()
+            yield scrapy.Request(url=response.urljoin(href),
+                                 callback=self.parse_page, dont_filter=True,
+                                 meta={'name':href.rpartition('/')[-1],
+                                       'url':response.urljoin(href)})

+        next_page = response.xpath(next_page_href_xpath).extract_first()
         if next_page:
-            yield scrapy.Request(url=response.urljoin(next_page),
+            yield scrapy.Request(url=response.urljoin(next_page),
+                                 callback=self.parse, dont_filter=True)

     def parse_page(self, response: TextResponse):
+        """
+        Collect all interesting pages IDs
+        and use the Fandom API to crawl them.
+        Save the output in JSON format.
+        """
+
         categories = set(response.xpath(categories_xpath).extract())
-
+
+        # the wiki page is interesting only if related to plot
+        # (= not contained in excluded categories)
         if len(categories.intersection(excluded_categories))==0:
             name = response.meta['name']
             url = response.meta['url']
             article_id = int(re.findall(article_id_pattern, response.text)[0])

-
-            page = fandom.page(pageid = article_id)
-
-
-
+            # once the ID is found, use the Fandom API to retrieve the clean page text
+            page = fandom.page(pageid = article_id)
+            text = page.plain_text.split('\nAppearances\n')[0]\
+                .split('\nReferences\n')[0]
             json_content={'name': name, 'url':url, 'text':text}
-
             with open(f'./data/{name}.json','w', encoding='utf-8') as fout:
-                json.dump(json_content, fout)
-
-
-
-
-
-
-
+                json.dump(json_content, fout)
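Beyond the `scrapy crawl tpcrawler` command documented in the README, the refactored spider can also be launched programmatically; a minimal sketch, assuming it is run from the `crawler/tpcrawler` project directory (the import path follows the file layout above):

```python
# Sketch: run the spider from Python instead of the Scrapy CLI.
# Assumes the working directory is crawler/tpcrawler, so that the
# project settings and the ./data/ output folder resolve correctly.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from tpcrawler.spiders.tpcrawler import Tpcrawler

process = CrawlerProcess(get_project_settings())
process.crawl(Tpcrawler)
process.start()  # blocks until the crawl has finished
```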