import lxml.etree
import json
# Receives extracted items from the spider and appends them to a JSON Lines file (one JSON object per line).
class JsonWriterPipeline(object):

    def open_spider(self, spider):
        self.file = open('newsresult.json', 'w')

    def close_spider(self, spider):
        print('JSON File Generated')
        self.file.close()

    def process_item(self, item, spider):
        line = json.dumps(dict(item)) + "\n"
        self.file.write(line)
        return item
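
# Each line written by process_item is one standalone JSON object along these lines (illustrative only):
#   {"headline": "...", "intro": "..."}
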
import logging
import scrapy
from scrapy.spiders import Rule, CrawlSpider
from scrapy.linkextractors import LinkExtractor
class NewsItem(scrapy.Item):
    # Define the fields for your item here, e.g.:
    headline = scrapy.Field()
    intro = scrapy.Field()
    # url = scrapy.Field()
class NewsSpider(CrawlSpider):
    name = "bbcnews"
    allowed_domains = ["bbc.co.uk"]
    start_urls = ["http://www.bbc.co.uk/news/technology/"]
    custom_settings = {
        'LOG_LEVEL': logging.WARNING,
        'ITEM_PIPELINES': {'__main__.JsonWriterPipeline': 1}  # Register the JsonWriterPipeline defined above
    }
    # Follow links to individual technology stories and parse each one with parse_story.
    rules = [Rule(LinkExtractor(allow=[r'/technology-\d+']), callback='parse_story')]

    def parse_story(self, response):
        story = NewsItem()
        story['headline'] = response.xpath('.//h1[@class="story-body__h1"]/text()').get()
        story['intro'] = response.xpath('.//p[@class="story-body__introduction"]/text()').get()
        # Yield the populated item; JsonWriterPipeline serialises it to one JSON line.
        yield story
from scrapy.crawler import CrawlerProcess
# Run the spider in-process with a browser-like user agent and block until the crawl finishes.
hgw_crawler_process = CrawlerProcess({
    'USER_AGENT': 'Mozilla/5.0 (compatible; MSIE 7.0; Windows NT 5.1)'
})
hgw_crawler_process.crawl(NewsSpider)
hgw_crawler_process.start()
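
# Quick verification, as a minimal sketch (not part of the original script): read the
# JSON Lines output back in and print the first few headlines. This assumes the crawl
# above completed and wrote newsresult.json to the current working directory.
with open('newsresult.json') as f:
    for i, line in enumerate(f):
        record = json.loads(line)  # one JSON object per line
        print(record.get('headline'))
        if i >= 4:  # show only the first five stories
            break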