import lxml.etree
import json
# Receives extracted items from the spider and appends them to a JSON Lines file (one JSON object per line).
class JsonWriterPipeline(object):

    def open_spider(self, spider):
        self.file = open('newsresult.json', 'w')

    def close_spider(self, spider):
        print('JSON File Generated')
        self.file.close()

    def process_item(self, item, spider):
        line = json.dumps(dict(item)) + "\n"
        self.file.write(line)
        return item
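
# Each line written by process_item is one standalone JSON object along these lines (illustrative only):
#   {"headline": "...", "intro": "..."}
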
import logging
import scrapy
from scrapy.spiders import Rule, CrawlSpider
from scrapy.linkextractors import LinkExtractor
class NewsItem(scrapy.Item):
    # Define the fields for your item here, e.g.:
    headline = scrapy.Field()
    intro = scrapy.Field()
    # url = scrapy.Field()
class NewsSpider(CrawlSpider):
    name = "bbcnews"
    allowed_domains = ["bbc.co.uk"]
    start_urls = ["http://www.bbc.co.uk/news/technology/"]
    custom_settings = {
        'LOG_LEVEL': logging.WARNING,
        'ITEM_PIPELINES': {'__main__.JsonWriterPipeline': 1}  # Register the JsonWriterPipeline defined above
    }
    # Follow links to individual technology stories and parse each one with parse_story.
    rules = [Rule(LinkExtractor(allow=[r'/technology-\d+']), callback='parse_story')]

    def parse_story(self, response):
        story = NewsItem()
        story['headline'] = response.xpath('.//h1[@class="story-body__h1"]/text()').get()
        story['intro'] = response.xpath('.//p[@class="story-body__introduction"]/text()').get()
        # Yield the populated item; JsonWriterPipeline serialises it to one JSON line.
        yield story
from scrapy.crawler import CrawlerProcess
# Run the spider in-process with a browser-like user agent and block until the crawl finishes.
hgw_crawler_process = CrawlerProcess({
    'USER_AGENT': 'Mozilla/5.0 (compatible; MSIE 7.0; Windows NT 5.1)'
})
hgw_crawler_process.crawl(NewsSpider)
hgw_crawler_process.start()
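
# Quick verification, as a minimal sketch (not part of the original script): read the
# JSON Lines output back in and print the first few headlines. This assumes the crawl
# above completed and wrote newsresult.json to the current working directory.
with open('newsresult.json') as f:
    for i, line in enumerate(f):
        record = json.loads(line)  # one JSON object per line
        print(record.get('headline'))
        if i >= 4:  # show only the first five stories
            break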