import lxml.etree
import json


# Receives each extracted item from the spider and appends it to a JSON Lines file
# (one JSON object per line).
class JsonWriterPipeline(object):

    def open_spider(self, spider):
        self.file = open('newsresult.json', 'w')

    def close_spider(self, spider):
        print('JSON File Generated')
        self.file.close()

    def process_item(self, item, spider):
        line = json.dumps(dict(item)) + "\n"
        self.file.write(line)
        return item


import logging
import scrapy
from scrapy.spiders import Rule, CrawlSpider
from scrapy.linkextractors import LinkExtractor


class NewsItem(scrapy.Item):
    # Define the fields for the item here:
    headline = scrapy.Field()
    intro = scrapy.Field()
    # url = scrapy.Field()


class NewsSpider(CrawlSpider):
    name = "bbcnews"
    allowed_domains = ["bbc.co.uk"]
    start_urls = ["http://www.bbc.co.uk/news/technology/"]

    custom_settings = {
        'LOG_LEVEL': logging.WARNING,
        'ITEM_PIPELINES': {'__main__.JsonWriterPipeline': 1},  # register the pipeline defined above
    }

    # Follow links whose URL contains /technology-<digits> and parse them with parse_story.
    rules = [Rule(LinkExtractor(allow=[r'/technology-\d+']), 'parse_story')]

    def parse_story(self, response):
        story = NewsItem()
        story['headline'] = response.xpath('.//h1[@class="story-body__h1"]/text()').get()
        story['intro'] = response.xpath('.//p[@class="story-body__introduction"]/text()').get()
        yield story


from scrapy.crawler import CrawlerProcess

hgw_crawler_process = CrawlerProcess({
    'USER_AGENT': 'Mozilla/5.0 (compatible; MSIE 7.0; Windows NT 5.1)'
})

hgw_crawler_process.crawl(NewsSpider)
hgw_crawler_process.start()

JSON File Generated
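The trailing "JSON File Generated" line is the output printed by close_spider once the crawl finishes. As a quick sanity check, the resulting newsresult.json can be read back line by line; the sketch below is only an illustration and assumes the file name and field names used above, parsing each non-empty line as a separate JSON object.

import json

# Read the JSON Lines file produced by JsonWriterPipeline: one JSON object per line.
with open('newsresult.json', 'r') as f:
    stories = [json.loads(line) for line in f if line.strip()]

# Print the headline of every scraped story.
for story in stories:
    print(story['headline'])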