# Cell: scrape quotes.toscrape.com and echo every scraped item to the console.
import json
import logging

import lxml.etree  # noqa: F401 -- not referenced below; kept from the original cell
import scrapy
from scrapy.crawler import CrawlerProcess


class ConsoleWriterPipeline:
    """Item pipeline that prints every scraped item as one JSON line."""

    def open_spider(self, spider):
        """Hook run when the spider opens; there is nothing to set up."""

    def close_spider(self, spider):
        """Hook run when the spider closes; there is nothing to tear down."""

    # The original spelled this hook ``close_spdier``, a name Scrapy never
    # calls. The old name is kept as an alias so existing references still work.
    close_spdier = close_spider

    def process_item(self, item, spider):
        """Print ``item`` serialized as JSON and return it unchanged."""
        # The explicit "\n" plus print()'s own newline leaves one blank line
        # between consecutive items, exactly as before.
        print(json.dumps(dict(item)) + "\n")
        return item


class QuotesSpider(scrapy.Spider):
    """Spider yielding text, author and tags for each quote on the start pages."""

    name = "quotes"
    start_urls = [
        'http://quotes.toscrape.com/page/1/',
        'http://quotes.toscrape.com/page/2/',
    ]
    custom_settings = {
        'LOG_LEVEL': logging.WARNING,  # Scrapy's default is DEBUG
        # Send every item through the console pipeline defined above.
        'ITEM_PIPELINES': {'__main__.ConsoleWriterPipeline': 1},
    }

    def parse(self, response):
        """Yield one dict per ``div.quote`` element found in ``response``."""
        for quote in response.css('div.quote'):
            yield {
                'text': quote.css('span.text::text').get(),
                'author': quote.css('span small::text').get(),
                'tags': quote.css('div.tags a.tag::text').getall(),
            }


# The pipeline is addressed as ``__main__.ConsoleWriterPipeline``, so the crawl
# only makes sense when this code runs as the main module (script or notebook).
if __name__ == "__main__":
    quotes_crawler_process = CrawlerProcess({
        'USER_AGENT': 'Mozilla/5.0 (compatible; MSIE 7.0; Windows NT 5.1)',
    })
    quotes_crawler_process.crawl(QuotesSpider)
    quotes_crawler_process.start()

# Captured output of the run (continues on the following lines):
# 2020-05-31 16:17:00 [scrapy.utils.log] INFO: Scrapy 2.1.0 started (bot: scrapybot)
2020-05-31 16:17:00 [scrapy.utils.log] INFO: Versions: lxml 4.5.1.0, libxml2 2.9.5, cssselect 1.1.0, parsel 1.6.0, w3lib 1.22.0, Twisted 20.3.0, Python 3.7.5 (tags/v3.7.5:5c02a39a0b, Oct 15 2019, 00:11:34) [MSC v.1916 64 bit (AMD64)], pyOpenSSL 19.1.0 (OpenSSL 1.1.1g 21 Apr 2020), cryptography 2.9.2, Platform Windows-10-10.0.18362-SP0
2020-05-31 16:17:00 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.selectreactor.SelectReactor
2020-05-31 16:17:00 [scrapy.crawler] INFO: Overridden settings:
{'LOG_LEVEL': 30,
'USER_AGENT': 'Mozilla/5.0 (compatible; MSIE 7.0; Windows NT 5.1)'}
{"text": "\u201cIt takes a great deal of bravery to stand up to our enemies, but just as much to stand up to our friends.\u201d", "author": "J.K. Rowling", "tags": ["courage", "friends"]}
{"text": "\u201cIf you can't explain it to a six year old, you don't understand it yourself.\u201d", "author": "Albert Einstein", "tags": ["simplicity", "understand"]}
{"text": "\u201cI like nonsense, it wakes up the brain cells. Fantasy is a necessary ingredient in living.\u201d", "author": "Dr. Seuss", "tags": ["fantasy"]}
{"text": "\u201cI may not have gone where I intended to go, but I think I have ended up where I needed to be.\u201d", "author": "Douglas Adams", "tags": ["life", "navigation"]}
# Cell: crawl BBC technology news stories and echo each headline/intro pair.
import json
import logging

import lxml.etree  # noqa: F401 -- not referenced below; kept from the original cell
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class ConsoleWriterPipeline:
    """Item pipeline that prints every scraped item as one JSON line."""

    def open_spider(self, spider):
        """Hook run when the spider opens; there is nothing to set up."""

    def close_spider(self, spider):
        """Hook run when the spider closes; there is nothing to tear down."""

    # The original spelled this hook ``close_spdier``, a name Scrapy never
    # calls. The old name is kept as an alias so existing references still work.
    close_spdier = close_spider

    def process_item(self, item, spider):
        """Print ``item`` serialized as JSON and return it unchanged."""
        # The explicit "\n" plus print()'s own newline leaves one blank line
        # between consecutive items, exactly as before.
        print(json.dumps(dict(item)) + "\n")
        return item


class NewsItem(scrapy.Item):
    """Fields collected for a single news story."""

    headline = scrapy.Field()
    intro = scrapy.Field()


class NewsSpider(CrawlSpider):
    """Crawl spider that follows technology story links and scrapes each story."""

    name = "bbcnews"
    allowed_domains = ["bbc.co.uk"]
    start_urls = ["http://www.bbc.co.uk/news/technology/"]
    custom_settings = {
        'LOG_LEVEL': logging.WARNING,
        # Send every item through the console pipeline defined above.
        'ITEM_PIPELINES': {'__main__.ConsoleWriterPipeline': 1},
    }
    # Raw string: "\d" in a plain literal is an invalid escape sequence.
    rules = [
        Rule(LinkExtractor(allow=[r'/technology-\d+']), callback='parse_story'),
    ]

    def parse_story(self, response):
        """Yield a dict with the ``headline`` and ``intro`` text of one story.

        Either value is ``None`` when the corresponding element is missing
        from ``response``.
        """
        story = NewsItem()
        story['headline'] = response.xpath(
            './/h1[@class="story-body__h1"]/text()').get()
        story['intro'] = response.xpath(
            './/p[@class="story-body__introduction"]/text()').get()
        # Emit a plain dict, as before, rather than the NewsItem itself.
        yield dict(story)


# The pipeline is addressed as ``__main__.ConsoleWriterPipeline``, so the crawl
# only makes sense when this code runs as the main module (script or notebook).
if __name__ == "__main__":
    hgw_crawler_process = CrawlerProcess({
        'USER_AGENT': 'Mozilla/5.0 (compatible; MSIE 7.0; Windows NT 5.1)',
    })
    hgw_crawler_process.crawl(NewsSpider)
    hgw_crawler_process.start()

# Captured output of the run (continues on the following lines):
# 2020-05-31 16:41:32 [scrapy.utils.log] INFO: Scrapy 2.1.0 started (bot: scrapybot)
2020-05-31 16:41:32 [scrapy.utils.log] INFO: Versions: lxml 4.5.1.0, libxml2 2.9.5, cssselect 1.1.0, parsel 1.6.0, w3lib 1.22.0, Twisted 20.3.0, Python 3.7.5 (tags/v3.7.5:5c02a39a0b, Oct 15 2019, 00:11:34) [MSC v.1916 64 bit (AMD64)], pyOpenSSL 19.1.0 (OpenSSL 1.1.1g 21 Apr 2020), cryptography 2.9.2, Platform Windows-10-10.0.18362-SP0
2020-05-31 16:41:32 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.selectreactor.SelectReactor
2020-05-31 16:41:32 [scrapy.crawler] INFO: Overridden settings:
{'LOG_LEVEL': 30,
'USER_AGENT': 'Mozilla/5.0 (compatible; MSIE 7.0; Windows NT 5.1)'}
{"headline": "Uber destroys thousands of bikes and scooters", "intro": "Uber is destroying thousands of electric bikes and scooters, after selling its Jump business to Lime."}
{"headline": "Tech Tent: Trump versus Twitter", "intro": "For years, Twitter resisted calls to treat President Trump just like any other user. Then this week, everything changed."}
{"headline": "Huawei: What would happen if the UK ditched the Chinese firm?", "intro": "Huawei's future in the UK is in doubt - again."}
{"headline": "Facebook dominates cases of recorded social media grooming", "intro": "Police in England and Wales recorded more than 10,000 online grooming offences on social media over two-and-a-half years."}
# Cell: crawl books.toscrape.com and echo title, price and stock of each book.
import json
import logging

import lxml.etree  # noqa: F401 -- not referenced below; kept from the original cell
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class ConsoleWriterPipeline:
    """Item pipeline that prints every scraped item as one JSON line."""

    def open_spider(self, spider):
        """Hook run when the spider opens; there is nothing to set up."""

    def close_spider(self, spider):
        """Hook run when the spider closes; there is nothing to tear down."""

    # The original spelled this hook ``close_spdier``, a name Scrapy never
    # calls. The old name is kept as an alias so existing references still work.
    close_spdier = close_spider

    def process_item(self, item, spider):
        """Print ``item`` serialized as JSON and return it unchanged."""
        # The explicit "\n" plus print()'s own newline leaves one blank line
        # between consecutive items, exactly as before.
        print(json.dumps(dict(item)) + "\n")
        return item


class BooksCrawlSpider(CrawlSpider):
    """Crawl spider that walks the catalogue pages and scrapes linked pages."""

    name = 'books-crawlspider'
    allowed_domains = ['toscrape.com']
    start_urls = ['http://books.toscrape.com']
    custom_settings = {
        'LOG_LEVEL': logging.WARNING,
        # Send every item through the console pipeline defined above.
        'ITEM_PIPELINES': {'__main__.ConsoleWriterPipeline': 1},
    }
    # Raw strings with escaped dots: "\d" in a plain literal is an invalid
    # escape sequence, and a bare "." would match any character.
    rules = [
        # Catalogue listing pages are only followed, never parsed.
        Rule(
            LinkExtractor(allow=r'/catalogue/page-\d+\.html'),
            follow=True,
        ),
        # Every other link, except category and index pages, is parsed.
        Rule(
            LinkExtractor(deny=(r'/category/books', r'\.com/index\.html')),
            callback='parse_book_page',
            follow=True,
        ),
    ]

    def parse_book_page(self, response):
        """Yield a dict with the ``title``, ``price`` and ``stock`` of a book.

        Pages without a product title are skipped. ``stock`` is ``None`` when
        the availability text contains no digits.
        """
        title = response.css('.product_main h1::text').get()
        if title is None:
            # No product block on this page; previously this ended in a
            # ValueError from int('') and no item was produced either.
            self.logger.debug("Skipping non-product page: %s", response.url)
            return

        stock_digits = ''.join(
            response.css('.product_main p.instock.availability::text').re(r'(\d+)')
        )
        yield {
            'title': title,
            'price': response.css('.product_main p.price_color::text').re_first('£(.*)'),
            # int('') raises ValueError, so a missing count is reported as None.
            'stock': int(stock_digits) if stock_digits else None,
        }


# The pipeline is addressed as ``__main__.ConsoleWriterPipeline``, so the crawl
# only makes sense when this code runs as the main module (script or notebook).
if __name__ == "__main__":
    hgw_crawler_process = CrawlerProcess({
        'USER_AGENT': 'Mozilla/5.0 (compatible; MSIE 7.0; Windows NT 5.1)',
    })
    hgw_crawler_process.crawl(BooksCrawlSpider)
    hgw_crawler_process.start()

# Captured output of the run (continues on the following lines):
# 2020-05-31 16:50:02 [scrapy.utils.log] INFO: Scrapy 2.1.0 started (bot: scrapybot)
2020-05-31 16:50:02 [scrapy.utils.log] INFO: Versions: lxml 4.5.1.0, libxml2 2.9.5, cssselect 1.1.0, parsel 1.6.0, w3lib 1.22.0, Twisted 20.3.0, Python 3.7.5 (tags/v3.7.5:5c02a39a0b, Oct 15 2019, 00:11:34) [MSC v.1916 64 bit (AMD64)], pyOpenSSL 19.1.0 (OpenSSL 1.1.1g 21 Apr 2020), cryptography 2.9.2, Platform Windows-10-10.0.18362-SP0
2020-05-31 16:50:02 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.selectreactor.SelectReactor
2020-05-31 16:50:02 [scrapy.crawler] INFO: Overridden settings:
{'LOG_LEVEL': 30,
'USER_AGENT': 'Mozilla/5.0 (compatible; MSIE 7.0; Windows NT 5.1)'}
{"title": "Our Band Could Be Your Life: Scenes from the American Indie Underground, 1981-1991", "price": "57.25", "stock": 19}
{"title": "Olio", "price": "23.88", "stock": 19}
{"title": "A Light in the Attic", "price": "51.77", "stock": 22}
{"title": "Scott Pilgrim's Precious Little Life (Scott Pilgrim #1)", "price": "52.29", "stock": 19}
{"title": "It's Only the Himalayas", "price": "45.17", "stock": 19}
{"title": "Libertarianism for Beginners", "price": "51.33", "stock": 19}
{"title": "Mesaerion: The Best Science Fiction Stories 1800-1849", "price": "37.59", "stock": 19}