import lxml.etree
import json
# receives the extract result from the spider and prints out the content
# Receives scraped items from the spider and prints each one as a JSON line.
class ConsoleWriterPipeline(object):
    """Item pipeline that serializes every scraped item to JSON on stdout."""

    def open_spider(self, spider):
        # No setup needed; Scrapy invokes this hook when the spider starts.
        # (Original body was a bare `None` expression; `pass` is the idiom.)
        pass

    def close_spider(self, spider):
        # Fixed typo: was `close_spdier`, so Scrapy never actually called it.
        pass

    def process_item(self, item, spider):
        # Serialize the item to one JSON line and echo it; Scrapy requires
        # the item to be returned so later pipelines can process it too.
        line = json.dumps(dict(item)) + "\n"
        print(line)
        return item
import logging
import scrapy
from scrapy.crawler import CrawlerProcess
class QuotesSpider(scrapy.Spider):
    """Scrape quote text, author and tags from quotes.toscrape.com pages."""

    name = "quotes"
    start_urls = [
        'http://quotes.toscrape.com/page/1/',
        'http://quotes.toscrape.com/page/2/',
    ]
    custom_settings = {
        'LOG_LEVEL': logging.WARNING,  # Default : Debug
        'ITEM_PIPELINES': {'__main__.ConsoleWriterPipeline': 1}  # Used for pipeline
    }

    def parse(self, response):
        """Yield one dict per <div class="quote"> element on the page."""
        for quote_sel in response.css('div.quote'):
            extracted = {
                'text': quote_sel.css('span.text::text').get(),
                'author': quote_sel.css('span small::text').get(),
                'tags': quote_sel.css('div.tags a.tag::text').getall(),
            }
            yield extracted
# Launch the quotes spider with a desktop IE user agent.
# start() blocks until the crawl has finished.
quotes_crawler_process = CrawlerProcess(
    {'USER_AGENT': 'Mozilla/5.0 (compatible; MSIE 7.0; Windows NT 5.1)'}
)
quotes_crawler_process.crawl(QuotesSpider)
quotes_crawler_process.start()
2020-05-31 16:17:00 [scrapy.utils.log] INFO: Scrapy 2.1.0 started (bot: scrapybot)
2020-05-31 16:17:00 [scrapy.utils.log] INFO: Versions: lxml 4.5.1.0, libxml2 2.9.5, cssselect 1.1.0, parsel 1.6.0, w3lib 1.22.0, Twisted 20.3.0, Python 3.7.5 (tags/v3.7.5:5c02a39a0b, Oct 15 2019, 00:11:34) [MSC v.1916 64 bit (AMD64)], pyOpenSSL 19.1.0 (OpenSSL 1.1.1g 21 Apr 2020), cryptography 2.9.2, Platform Windows-10-10.0.18362-SP0
2020-05-31 16:17:00 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.selectreactor.SelectReactor
2020-05-31 16:17:00 [scrapy.crawler] INFO: Overridden settings:
{'LOG_LEVEL': 30,
'USER_AGENT': 'Mozilla/5.0 (compatible; MSIE 7.0; Windows NT 5.1)'}
{"text": "\u201cIt takes a great deal of bravery to stand up to our enemies, but just as much to stand up to our friends.\u201d", "author": "J.K. Rowling", "tags": ["courage", "friends"]}
{"text": "\u201cIf you can't explain it to a six year old, you don't understand it yourself.\u201d", "author": "Albert Einstein", "tags": ["simplicity", "understand"]}
{"text": "\u201cI like nonsense, it wakes up the brain cells. Fantasy is a necessary ingredient in living.\u201d", "author": "Dr. Seuss", "tags": ["fantasy"]}
{"text": "\u201cI may not have gone where I intended to go, but I think I have ended up where I needed to be.\u201d", "author": "Douglas Adams", "tags": ["life", "navigation"]}
import lxml.etree
import json
# Receives scraped items from the spider and prints each one as a JSON line.
class ConsoleWriterPipeline(object):
    """Item pipeline that serializes every scraped item to JSON on stdout."""

    def open_spider(self, spider):
        # No setup needed; Scrapy invokes this hook when the spider starts.
        # (Original body was a bare `None` expression; `pass` is the idiom.)
        pass

    def close_spider(self, spider):
        # Fixed typo: was `close_spdier`, so Scrapy never actually called it.
        pass

    def process_item(self, item, spider):
        # Serialize the item to one JSON line and echo it; Scrapy requires
        # the item to be returned so later pipelines can process it too.
        line = json.dumps(dict(item)) + "\n"
        print(line)
        return item
import logging
import scrapy
from scrapy.spiders import Rule, CrawlSpider
from scrapy.linkextractors import LinkExtractor
class NewsItem(scrapy.Item):
    """Container for one scraped BBC News story."""
    # Removed a commented-out `url` field (dead code).
    headline = scrapy.Field()  # story title text
    intro = scrapy.Field()     # introductory paragraph text
class NewsSpider(CrawlSpider):
    """Crawl BBC technology news and yield headline + intro for each story."""

    name = "bbcnews"
    allowed_domains = ["bbc.co.uk"]
    start_urls = ["http://www.bbc.co.uk/news/technology/", ]
    custom_settings = {
        'LOG_LEVEL': logging.WARNING,
        'ITEM_PIPELINES': {'__main__.ConsoleWriterPipeline': 1}  # Used for pipeline
    }
    # Raw string fixes the invalid "\d" escape in an ordinary string
    # (a SyntaxWarning/DeprecationWarning on modern Python).
    rules = [Rule(LinkExtractor(allow=[r'/technology-\d+']), 'parse_story')]

    def parse_story(self, response):
        """Extract the headline and intro paragraph from a story page."""
        story = NewsItem()
        story['headline'] = response.xpath('.//h1[@class="story-body__h1"]/text()').get()
        story['intro'] = response.xpath('.//p[@class="story-body__introduction"]/text()').get()
        yield {
            "headline": story['headline'],
            "intro": story['intro'],
        }
from scrapy.crawler import CrawlerProcess

# Launch the BBC news crawl; start() blocks until it completes.
hgw_crawler_process = CrawlerProcess(
    {'USER_AGENT': 'Mozilla/5.0 (compatible; MSIE 7.0; Windows NT 5.1)'}
)
hgw_crawler_process.crawl(NewsSpider)
hgw_crawler_process.start()
2020-05-31 16:41:32 [scrapy.utils.log] INFO: Scrapy 2.1.0 started (bot: scrapybot)
2020-05-31 16:41:32 [scrapy.utils.log] INFO: Versions: lxml 4.5.1.0, libxml2 2.9.5, cssselect 1.1.0, parsel 1.6.0, w3lib 1.22.0, Twisted 20.3.0, Python 3.7.5 (tags/v3.7.5:5c02a39a0b, Oct 15 2019, 00:11:34) [MSC v.1916 64 bit (AMD64)], pyOpenSSL 19.1.0 (OpenSSL 1.1.1g 21 Apr 2020), cryptography 2.9.2, Platform Windows-10-10.0.18362-SP0
2020-05-31 16:41:32 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.selectreactor.SelectReactor
2020-05-31 16:41:32 [scrapy.crawler] INFO: Overridden settings:
{'LOG_LEVEL': 30,
'USER_AGENT': 'Mozilla/5.0 (compatible; MSIE 7.0; Windows NT 5.1)'}
{"headline": "Uber destroys thousands of bikes and scooters", "intro": "Uber is destroying thousands of electric bikes and scooters, after selling its Jump business to Lime."}
{"headline": "Tech Tent: Trump versus Twitter", "intro": "For years, Twitter resisted calls to treat President Trump just like any other user. Then this week, everything changed."}
{"headline": "Huawei: What would happen if the UK ditched the Chinese firm?", "intro": "Huawei's future in the UK is in doubt - again."}
{"headline": "Facebook dominates cases of recorded social media grooming", "intro": "Police in England and Wales recorded more than 10,000 online grooming offences on social media over two-and-a-half years."}
import lxml.etree
import json
# Receives scraped items from the spider and prints each one as a JSON line.
class ConsoleWriterPipeline(object):
    """Item pipeline that serializes every scraped item to JSON on stdout."""

    def open_spider(self, spider):
        # No setup needed; Scrapy invokes this hook when the spider starts.
        # (Original body was a bare `None` expression; `pass` is the idiom.)
        pass

    def close_spider(self, spider):
        # Fixed typo: was `close_spdier`, so Scrapy never actually called it.
        pass

    def process_item(self, item, spider):
        # Serialize the item to one JSON line and echo it; Scrapy requires
        # the item to be returned so later pipelines can process it too.
        line = json.dumps(dict(item)) + "\n"
        print(line)
        return item
import logging
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
class BooksCrawlSpider(CrawlSpider):
    """Crawl books.toscrape.com and yield title, price and stock per book."""

    name = 'books-crawlspider'
    allowed_domains = ['toscrape.com']
    start_urls = ['http://books.toscrape.com']
    custom_settings = {
        'LOG_LEVEL': logging.WARNING,
        'ITEM_PIPELINES': {'__main__.ConsoleWriterPipeline': 1}  # Used for pipeline
    }
    rules = [
        # Follow pagination. Raw string + escaped dot fix the invalid "\d"
        # escape and stop "." from matching any character.
        Rule(
            LinkExtractor(allow=(r'/catalogue/page-\d+\.html',)), follow=True
        ),
        # Follow everything except category listings and the index page,
        # handing book detail pages to parse_book_page.
        Rule(
            LinkExtractor(deny=('/category/books', '.com/index.html')),
            callback='parse_book_page',
            follow=True
        ),
    ]

    def parse_book_page(self, response):
        """Extract title, GBP price (string, no currency sign) and stock."""
        yield {
            'title': response.css('.product_main h1::text').get(),
            'price': response.css('.product_main p.price_color::text').re_first('£(.*)'),
            # Availability text contains the count, e.g. "In stock (19 available)".
            'stock': int(''.join(response.css('.product_main p.instock.availability::text').re(r'(\d+)')))
        }
from scrapy.crawler import CrawlerProcess

# Launch the books crawl; start() blocks until the crawl finishes.
hgw_crawler_process = CrawlerProcess(
    {'USER_AGENT': 'Mozilla/5.0 (compatible; MSIE 7.0; Windows NT 5.1)'}
)
hgw_crawler_process.crawl(BooksCrawlSpider)
hgw_crawler_process.start()
2020-05-31 16:50:02 [scrapy.utils.log] INFO: Scrapy 2.1.0 started (bot: scrapybot)
2020-05-31 16:50:02 [scrapy.utils.log] INFO: Versions: lxml 4.5.1.0, libxml2 2.9.5, cssselect 1.1.0, parsel 1.6.0, w3lib 1.22.0, Twisted 20.3.0, Python 3.7.5 (tags/v3.7.5:5c02a39a0b, Oct 15 2019, 00:11:34) [MSC v.1916 64 bit (AMD64)], pyOpenSSL 19.1.0 (OpenSSL 1.1.1g 21 Apr 2020), cryptography 2.9.2, Platform Windows-10-10.0.18362-SP0
2020-05-31 16:50:02 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.selectreactor.SelectReactor
2020-05-31 16:50:02 [scrapy.crawler] INFO: Overridden settings:
{'LOG_LEVEL': 30,
'USER_AGENT': 'Mozilla/5.0 (compatible; MSIE 7.0; Windows NT 5.1)'}
{"title": "Our Band Could Be Your Life: Scenes from the American Indie Underground, 1981-1991", "price": "57.25", "stock": 19}
{"title": "Olio", "price": "23.88", "stock": 19}
{"title": "A Light in the Attic", "price": "51.77", "stock": 22}
{"title": "Scott Pilgrim's Precious Little Life (Scott Pilgrim #1)", "price": "52.29", "stock": 19}
{"title": "It's Only the Himalayas", "price": "45.17", "stock": 19}
{"title": "Libertarianism for Beginners", "price": "51.33", "stock": 19}
{"title": "Mesaerion: The Best Science Fiction Stories 1800-1849", "price": "37.59", "stock": 19}