import lxml.etree
import json
# receives the extract result from the spider and prints out the content
# Receives scraped items from the spider and prints each one as a JSON line.
class ConsoleWriterPipeline(object):
    """Item pipeline that serializes every scraped item to JSON on stdout."""

    def open_spider(self, spider):
        # No setup needed; Scrapy invokes this hook when the spider starts.
        # (Original body was a bare `None` expression; `pass` is the idiom.)
        pass

    def close_spider(self, spider):
        # Fixed typo: was `close_spdier`, so Scrapy never actually called it.
        pass

    def process_item(self, item, spider):
        # Serialize the item to one JSON line and echo it; Scrapy requires
        # the item to be returned so later pipelines can process it too.
        line = json.dumps(dict(item)) + "\n"
        print(line)
        return item
import logging
import scrapy
from scrapy.crawler import CrawlerProcess
class QuotesSpider(scrapy.Spider):
    """Scrape quote text, author and tags from quotes.toscrape.com pages."""

    name = "quotes"
    start_urls = [
        'http://quotes.toscrape.com/page/1/',
        'http://quotes.toscrape.com/page/2/',
    ]
    custom_settings = {
        'LOG_LEVEL': logging.WARNING,  # Default : Debug
        'ITEM_PIPELINES': {'__main__.ConsoleWriterPipeline': 1}  # Used for pipeline
    }

    def parse(self, response):
        """Yield one dict per <div class="quote"> element on the page."""
        for quote_sel in response.css('div.quote'):
            extracted = {
                'text': quote_sel.css('span.text::text').get(),
                'author': quote_sel.css('span small::text').get(),
                'tags': quote_sel.css('div.tags a.tag::text').getall(),
            }
            yield extracted
# Launch the quotes spider with a desktop IE user agent.
# start() blocks until the crawl has finished.
quotes_crawler_process = CrawlerProcess(
    {'USER_AGENT': 'Mozilla/5.0 (compatible; MSIE 7.0; Windows NT 5.1)'}
)
quotes_crawler_process.crawl(QuotesSpider)
quotes_crawler_process.start()
2020-05-31 16:17:00 [scrapy.utils.log] INFO: Scrapy 2.1.0 started (bot: scrapybot)
2020-05-31 16:17:00 [scrapy.utils.log] INFO: Versions: lxml 4.5.1.0, libxml2 2.9.5, cssselect 1.1.0, parsel 1.6.0, w3lib 1.22.0, Twisted 20.3.0, Python 3.7.5 (tags/v3.7.5:5c02a39a0b, Oct 15 2019, 00:11:34) [MSC v.1916 64 bit (AMD64)], pyOpenSSL 19.1.0 (OpenSSL 1.1.1g 21 Apr 2020), cryptography 2.9.2, Platform Windows-10-10.0.18362-SP0
2020-05-31 16:17:00 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.selectreactor.SelectReactor
2020-05-31 16:17:00 [scrapy.crawler] INFO: Overridden settings:
{'LOG_LEVEL': 30,
'USER_AGENT': 'Mozilla/5.0 (compatible; MSIE 7.0; Windows NT 5.1)'}
{"text": "\u201cIt takes a great deal of bravery to stand up to our enemies, but just as much to stand up to our friends.\u201d", "author": "J.K. Rowling", "tags": ["courage", "friends"]}
{"text": "\u201cIf you can't explain it to a six year old, you don't understand it yourself.\u201d", "author": "Albert Einstein", "tags": ["simplicity", "understand"]}
{"text": "\u201cI like nonsense, it wakes up the brain cells. Fantasy is a necessary ingredient in living.\u201d", "author": "Dr. Seuss", "tags": ["fantasy"]}
{"text": "\u201cI may not have gone where I intended to go, but I think I have ended up where I needed to be.\u201d", "author": "Douglas Adams", "tags": ["life", "navigation"]}
import lxml.etree
import json
# Receives scraped items from the spider and prints each one as a JSON line.
class ConsoleWriterPipeline(object):
    """Item pipeline that serializes every scraped item to JSON on stdout."""

    def open_spider(self, spider):
        # No setup needed; Scrapy invokes this hook when the spider starts.
        # (Original body was a bare `None` expression; `pass` is the idiom.)
        pass

    def close_spider(self, spider):
        # Fixed typo: was `close_spdier`, so Scrapy never actually called it.
        pass

    def process_item(self, item, spider):
        # Serialize the item to one JSON line and echo it; Scrapy requires
        # the item to be returned so later pipelines can process it too.
        line = json.dumps(dict(item)) + "\n"
        print(line)
        return item
import logging
import scrapy
from scrapy.spiders import Rule, CrawlSpider
from scrapy.linkextractors import LinkExtractor
class NewsItem(scrapy.Item):
    """Container for one scraped BBC News story."""
    # Removed a commented-out `url` field (dead code).
    headline = scrapy.Field()  # story title text
    intro = scrapy.Field()     # introductory paragraph text
class NewsSpider(CrawlSpider):
    """Crawl BBC technology news and yield headline + intro for each story."""

    name = "bbcnews"
    allowed_domains = ["bbc.co.uk"]
    start_urls = ["http://www.bbc.co.uk/news/technology/", ]
    custom_settings = {
        'LOG_LEVEL': logging.WARNING,
        'ITEM_PIPELINES': {'__main__.ConsoleWriterPipeline': 1}  # Used for pipeline
    }
    # Raw string fixes the invalid "\d" escape in an ordinary string
    # (a SyntaxWarning/DeprecationWarning on modern Python).
    rules = [Rule(LinkExtractor(allow=[r'/technology-\d+']), 'parse_story')]

    def parse_story(self, response):
        """Extract the headline and intro paragraph from a story page."""
        story = NewsItem()
        story['headline'] = response.xpath('.//h1[@class="story-body__h1"]/text()').get()
        story['intro'] = response.xpath('.//p[@class="story-body__introduction"]/text()').get()
        yield {
            "headline": story['headline'],
            "intro": story['intro'],
        }
from scrapy.crawler import CrawlerProcess

# Launch the BBC news crawl; start() blocks until it completes.
hgw_crawler_process = CrawlerProcess(
    {'USER_AGENT': 'Mozilla/5.0 (compatible; MSIE 7.0; Windows NT 5.1)'}
)
hgw_crawler_process.crawl(NewsSpider)
hgw_crawler_process.start()
2020-05-31 16:41:32 [scrapy.utils.log] INFO: Scrapy 2.1.0 started (bot: scrapybot)
2020-05-31 16:41:32 [scrapy.utils.log] INFO: Versions: lxml 4.5.1.0, libxml2 2.9.5, cssselect 1.1.0, parsel 1.6.0, w3lib 1.22.0, Twisted 20.3.0, Python 3.7.5 (tags/v3.7.5:5c02a39a0b, Oct 15 2019, 00:11:34) [MSC v.1916 64 bit (AMD64)], pyOpenSSL 19.1.0 (OpenSSL 1.1.1g 21 Apr 2020), cryptography 2.9.2, Platform Windows-10-10.0.18362-SP0
2020-05-31 16:41:32 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.selectreactor.SelectReactor
2020-05-31 16:41:32 [scrapy.crawler] INFO: Overridden settings:
{'LOG_LEVEL': 30,
'USER_AGENT': 'Mozilla/5.0 (compatible; MSIE 7.0; Windows NT 5.1)'}
{"headline": "Uber destroys thousands of bikes and scooters", "intro": "Uber is destroying thousands of electric bikes and scooters, after selling its Jump business to Lime."}
{"headline": "Tech Tent: Trump versus Twitter", "intro": "For years, Twitter resisted calls to treat President Trump just like any other user. Then this week, everything changed."}
{"headline": "Huawei: What would happen if the UK ditched the Chinese firm?", "intro": "Huawei's future in the UK is in doubt - again."}
{"headline": "Facebook dominates cases of recorded social media grooming", "intro": "Police in England and Wales recorded more than 10,000 online grooming offences on social media over two-and-a-half years."}
import lxml.etree
import json
# Receives scraped items from the spider and prints each one as a JSON line.
class ConsoleWriterPipeline(object):
    """Item pipeline that serializes every scraped item to JSON on stdout."""

    def open_spider(self, spider):
        # No setup needed; Scrapy invokes this hook when the spider starts.
        # (Original body was a bare `None` expression; `pass` is the idiom.)
        pass

    def close_spider(self, spider):
        # Fixed typo: was `close_spdier`, so Scrapy never actually called it.
        pass

    def process_item(self, item, spider):
        # Serialize the item to one JSON line and echo it; Scrapy requires
        # the item to be returned so later pipelines can process it too.
        line = json.dumps(dict(item)) + "\n"
        print(line)
        return item
import logging
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
class BooksCrawlSpider(CrawlSpider):
    """Crawl books.toscrape.com and yield title, price and stock per book."""

    name = 'books-crawlspider'
    allowed_domains = ['toscrape.com']
    start_urls = ['http://books.toscrape.com']
    custom_settings = {
        'LOG_LEVEL': logging.WARNING,
        'ITEM_PIPELINES': {'__main__.ConsoleWriterPipeline': 1}  # Used for pipeline
    }
    rules = [
        # Follow pagination. Raw string + escaped dot fix the invalid "\d"
        # escape and stop "." from matching any character.
        Rule(
            LinkExtractor(allow=(r'/catalogue/page-\d+\.html',)), follow=True
        ),
        # Follow everything except category listings and the index page,
        # handing book detail pages to parse_book_page.
        Rule(
            LinkExtractor(deny=('/category/books', '.com/index.html')),
            callback='parse_book_page',
            follow=True
        ),
    ]

    def parse_book_page(self, response):
        """Extract title, GBP price (string, no currency sign) and stock."""
        yield {
            'title': response.css('.product_main h1::text').get(),
            'price': response.css('.product_main p.price_color::text').re_first('£(.*)'),
            # Availability text contains the count, e.g. "In stock (19 available)".
            'stock': int(''.join(response.css('.product_main p.instock.availability::text').re(r'(\d+)')))
        }
from scrapy.crawler import CrawlerProcess

# Launch the books crawl; start() blocks until the crawl finishes.
hgw_crawler_process = CrawlerProcess(
    {'USER_AGENT': 'Mozilla/5.0 (compatible; MSIE 7.0; Windows NT 5.1)'}
)
hgw_crawler_process.crawl(BooksCrawlSpider)
hgw_crawler_process.start()
2020-05-31 16:50:02 [scrapy.utils.log] INFO: Scrapy 2.1.0 started (bot: scrapybot)
2020-05-31 16:50:02 [scrapy.utils.log] INFO: Versions: lxml 4.5.1.0, libxml2 2.9.5, cssselect 1.1.0, parsel 1.6.0, w3lib 1.22.0, Twisted 20.3.0, Python 3.7.5 (tags/v3.7.5:5c02a39a0b, Oct 15 2019, 00:11:34) [MSC v.1916 64 bit (AMD64)], pyOpenSSL 19.1.0 (OpenSSL 1.1.1g 21 Apr 2020), cryptography 2.9.2, Platform Windows-10-10.0.18362-SP0
2020-05-31 16:50:02 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.selectreactor.SelectReactor
2020-05-31 16:50:02 [scrapy.crawler] INFO: Overridden settings:
{'LOG_LEVEL': 30,
'USER_AGENT': 'Mozilla/5.0 (compatible; MSIE 7.0; Windows NT 5.1)'}
{"title": "Our Band Could Be Your Life: Scenes from the American Indie Underground, 1981-1991", "price": "57.25", "stock": 19}
{"title": "Olio", "price": "23.88", "stock": 19}
{"title": "A Light in the Attic", "price": "51.77", "stock": 22}
{"title": "Scott Pilgrim's Precious Little Life (Scott Pilgrim #1)", "price": "52.29", "stock": 19}
{"title": "It's Only the Himalayas", "price": "45.17", "stock": 19}
{"title": "Libertarianism for Beginners", "price": "51.33", "stock": 19}
{"title": "Mesaerion: The Best Science Fiction Stories 1800-1849", "price": "37.59", "stock": 19}