Using Scrapy to crawl daily TWSE (Taiwan Stock Exchange) stock price data and store it in MongoDB

The complete source code for this program is on GitHub at the following URL:

https://github.com/jang0820/scrapy/tree/master/twse_mongo

Step 0) In MongoDB, create the database STwStock and add a collection named twse.
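
If you would rather do this from Python than from the mongo shell, a minimal pymongo sketch such as the one below also works. MongoDB creates the database and collection lazily on the first insert anyway, so this step mainly makes the target explicit:

from pymongo import MongoClient

client = MongoClient('localhost', 27017)       # default local MongoDB instance
db = client['STwStock']                         # database used by this project
if 'twse' not in db.list_collection_names():
    db.create_collection('twse')                # collection that will hold the daily prices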

Step 1) Edit twse_mongo\spiders\twse.py. Scrapy crawls the URLs listed in start_urls, and this program builds start_urls dynamically in the __init__ method. After each URL is downloaded, Scrapy automatically calls parse, which takes the JSON stock data returned by the exchange, converts its format through transform, and stores the result in an item object. Through the project settings, Scrapy then hands each item to the pipeline, where the code inserts the data into the MongoDB database.

import scrapy
import json
from twse_mongo.items import TwseMongoItem
import time
import datetime


class TwseSpider(scrapy.Spider):
    name = 'twse'
    allowed_domains = ['www.twse.com.tw']
    start_urls = []

    def __init__(self):
        dates = []
        years = [2018]
        today = datetime.datetime.now()
        for y in years:
            if y < int(today.year):
                for m in range(1, 13):  # generate year/month strings for past years
                    if m < 10:
                        s = str(y) + '0' + str(m) + '01'
                    else:
                        s = str(y) + str(m) + '01'
                    dates.append(s)
            if y == int(today.year):  # generate year/month strings for the current year
                for m in range(1, int(today.month) + 1):
                    if m < 10:
                        s = str(y) + '0' + str(m) + '01'
                    else:
                        s = str(y) + str(m) + '01'
                    dates.append(s)
        stockno_list = ['2330']  # stock codes to collect; the list can hold more than one code
        for stockno in stockno_list:
            for date in dates:
                # build the stock/date URL in the format the exchange expects
                url = 'http://www.twse.com.tw/exchangeReport/STOCK_DAY?date=%s&stockNo=%s' % (
                    date, stockno)
                # add it to start_urls; Scrapy automatically fetches every URL in start_urls
                self.start_urls.append(url)

    def transform_date(self, date):  # convert an ROC (Minguo) date to the Gregorian calendar
        y, m, d = date.split('/')
        return str(int(y) + 1911) + '/' + m + '/' + d

    def transform_data(self, data):  # convert the field formats of one daily row
        data[0] = datetime.datetime.strptime(self.transform_date(data[0]), '%Y/%m/%d')
        data[1] = int(data[1].replace(',', ''))  # strip the thousands separators
        data[2] = int(data[2].replace(',', ''))
        data[3] = float(data[3].replace(',', ''))
        data[4] = float(data[4].replace(',', ''))
        data[5] = float(data[5].replace(',', ''))
        data[6] = float(data[6].replace(',', ''))
        data[7] = float(0.0 if data[7].replace(',', '') == 'X0.00' else data[7].replace(',', ''))  # +/-/X mark up/down/no comparison
        data[8] = int(data[8].replace(',', ''))
        return data

    def transform(self, data):  # convert every row in data
        return [self.transform_data(d) for d in data]

    def parse(self, response):
        # the exchange returns JSON; json.loads turns it into a Python dict
        data_src = json.loads(response.body_as_unicode())
        stockno = response.url[-4:]  # extract the stock code from the URL
        item = TwseMongoItem()  # TwseMongoItem is defined in items.py
        # the 'data' key holds a whole month of prices and volumes; transform converts the formats
        data = self.transform(data_src['data'])
        for d in data:
            item['date'] = d[0]  # copy the row into the item, which is then passed to the pipeline
            item['stockno'] = stockno
            item['shares'] = d[1]
            item['amount'] = d[2]
            item['open'] = d[3]
            item['close'] = d[4]
            item['high'] = d[5]
            item['low'] = d[6]
            item['diff'] = d[7]
            item['turnover'] = d[8]
            yield item
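
Before running the whole spider it can help to look at the raw JSON that parse() will receive. A small sketch, assuming the requests package is installed and reusing the URL format built in __init__ (the date 20180101 and stock code 2330 are just example parameters):

import requests

url = 'http://www.twse.com.tw/exchangeReport/STOCK_DAY?date=20180101&stockNo=2330'
resp = requests.get(url)
data_src = resp.json()          # same structure parse() gets after json.loads
print(data_src.keys())          # the spider relies on the 'data' key
print(data_src['data'][0])      # first daily row, still as comma-formatted strings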

Step 2) Edit twse_mongo\items.py to define the fields that receive the scraped data.

import scrapy


class TwseMongoItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    date = scrapy.Field()
    stockno = scrapy.Field()
    shares = scrapy.Field()
    amount = scrapy.Field()
    open = scrapy.Field()
    close = scrapy.Field()
    high = scrapy.Field()
    low = scrapy.Field()
    diff = scrapy.Field()
    turnover = scrapy.Field()
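
As a side note, a scrapy.Item behaves much like a dictionary that only accepts the declared fields, which is why the spider can simply assign item['date'], item['stockno'], and so on. A tiny illustration with placeholder values, not project code:

from twse_mongo.items import TwseMongoItem

item = TwseMongoItem()
item['stockno'] = '2330'
item['shares'] = 1000           # placeholder value
print(dict(item))               # {'stockno': '2330', 'shares': 1000}
# item['volume'] = 1 would raise KeyError because 'volume' is not a declared field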

Step 3) Edit twse_mongo\pipelines.py to insert each item into the database.

import pymongo
from twse_mongo import settings
from twse_mongo.items import TwseMongoItem
from pymongo import MongoClient


class TwseMongoPipeline(object):
    def __init__(self):
        # connect to the database; the connection values live in settings.py
        self.client = MongoClient(settings.MONGO_HOST, 27017)
        self.db = self.client[settings.MONGO_DB]
        self.collection = self.db[settings.MONGO_COLLETION]

    def process_item(self, item, spider):
        if item.__class__ == TwseMongoItem:  # route different Item classes to different collections
            # check whether this record is already in MongoDB
            if self.collection.find({"date": item['date'], "stockno": item['stockno']}).count() == 0:
                element = {'date': item['date'], 'stockno': item['stockno'],
                           'shares': item['shares'], 'amount': item['amount'],
                           'open': item['open'], 'close': item['close'],
                           'high': item['high'], 'low': item['low'],
                           'diff': item['diff'], 'turnover': item['turnover']}  # one day of prices and volume
                self.collection.insert_one(element)  # insert the record into the database
        return item
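
The duplicate check above sends an extra query for every item. An alternative worth considering, sketched below and not part of the original project, is a unique compound index on date and stockno so MongoDB rejects duplicates by itself:

from pymongo import MongoClient, ASCENDING, errors

client = MongoClient('localhost', 27017)
collection = client['STwStock']['twse']
# one-time setup: refuse any second document with the same date and stockno
collection.create_index([('date', ASCENDING), ('stockno', ASCENDING)], unique=True)

# in process_item the insert could then be wrapped like this:
# try:
#     self.collection.insert_one(element)
# except errors.DuplicateKeyError:
#     pass  # record already stored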

Step 4) The project settings are in twse_mongo\settings.py. Add the database settings, set the delay between page requests, and enable ITEM_PIPELINES so that items are actually stored in the Mongo database.

BOT_NAME = 'twse_mongo'

SPIDER_MODULES = ['twse_mongo.spiders']
NEWSPIDER_MODULE = 'twse_mongo.spiders'

MONGO_HOST = 'localhost'
MONGO_DB = 'STwStock'
MONGO_COLLETION = 'twse'  # the collection, comparable to a table

# TWSE limits how much data each IP may fetch per unit of time; without a delay the
# site blocks the IP for a while, so slow the requests down
DOWNLOAD_DELAY = 5
CONCURRENT_REQUESTS_PER_DOMAIN = 2

ITEM_PIPELINES = {
    'twse_mongo.pipelines.TwseMongoPipeline': 300,
}
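
Here the pipeline reads these values through settings.MONGO_HOST and friends. Scrapy also provides a from_crawler hook that passes the settings object to the pipeline; a sketch of that variant, again not what this project uses, looks like this:

from pymongo import MongoClient

class TwseMongoPipeline(object):
    def __init__(self, host, db_name, collection_name):
        self.client = MongoClient(host, 27017)
        self.collection = self.client[db_name][collection_name]

    @classmethod
    def from_crawler(cls, crawler):
        # Scrapy calls this and provides the project settings via crawler.settings
        return cls(
            host=crawler.settings.get('MONGO_HOST'),
            db_name=crawler.settings.get('MONGO_DB'),
            collection_name=crawler.settings.get('MONGO_COLLETION'),
        )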

Step 5) If you downloaded the code from GitHub, run the command "scrapy crawl twse" in the twse_mongo folder to download the stock data; the records are added to the twse collection of the STwStock database in MongoDB.

Querying the twse collection of the STwStock database with MongoDB Compass Community gives the result shown in the figure below.
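
If MongoDB Compass is not available, the same check can be done from Python with pymongo; a small sketch (the stock code 2330 is just an example query):

from pymongo import MongoClient

client = MongoClient('localhost', 27017)
collection = client['STwStock']['twse']

# print the five most recent rows stored for stock 2330
for doc in collection.find({'stockno': '2330'}).sort('date', -1).limit(5):
    print(doc['date'], doc['open'], doc['close'], doc['diff'])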