匯入資料至全文檢索 (Solr)

XML 資料結構

<add>
  <doc>
    <!-- 商品服務提供商識別碼,由程式自動產生 -->
    <field name="id"></field>
    <!-- 商品服務提供商名稱 -->
    <field name="author">先進整合</field>
    <!-- 商品服務提供商概述 -->
    <field name="comments">致力以完整扎實的技術根基,結合創新思維,協助客戶將科技適切地融入居家環境與個人生活。</field>
    <!-- 商品服務網址 -->
    <field name="url">http://www.i2.com.tw/index.php/services</field>
    <!-- 商品服務分類 -->
    <field name="sku">智慧化住宅解決方案</field>
    <!-- 商品服務名稱 -->
    <field name="name">智慧化住宅解決方案</field>
    <!-- 商品服務概述 -->
    <field name="description">住宅智慧控制、燈光/能源管理控制</field>
    <!-- 商品服務功能 -->
    <field name="features"></field>
    <!-- 商品服務關鍵字 -->
    <field name="keywords">門禁控管</field>
    <!-- 智慧家庭應用分類 -->
    <field name="cat">居家安全</field>
    <field name="cat">環境監控</field>
  </doc>
</add>

全文檢索

程式碼

# -*- coding: utf-8 -*- import sys import glob import uuid import xml.etree.ElementTree as ET import requests class VendorInfo: solrHost = 'localhost' solrPort = 8983 solrCollection = 'collection1' def __init__(self, solrCollection='collection1', solrHost='localhost', solrPort=8983): self.solrHost = solrHost self.solrPort = solrPort self.solrCollection = solrCollection def ValidateData(self, xmlFile): tree = ET.parse(xmlFile) root = tree.getroot() vendorInfoID = None try: for doc in root.findall('doc'): vendorID = doc.find("field/[@name='id']") if vendorID is None: vendorInfoID = str(uuid.uuid4()).replace('-', '') vendorID.text = vendorInfoID else: if vendorID.text is None: vendorInfoID = str(uuid.uuid4()).replace('-', '') vendorID.text = vendorInfoID else: vendorInfoID = vendorID.text except: print "\n\n%s 錯誤:%s\n\n" % (xmlFile, sys.exc_info()[0]) vendorInfoID = None if vendorInfoID is not None: tree.write(xmlFile, encoding="UTF-8", xml_declaration=True) return vendorInfoID def CheckResponse(self, responseXML): root = ET.fromstring(responseXML) statusPOST = -1 for lstTag in root.findall('lst'): for inTag in lstTag: if inTag.attrib['name'] == 'status': statusPOST = int(inTag.text) return (statusPOST == 0) def RemoveALL(self): urlSolr = "http://%s:%d/solr/%s/update" % (self.solrHost, self.solrPort, self.solrCollection) headersHTML = {"content-type": "text/xml;charset=utf-8"} paramsHTML = {"commit": "true"} xmlContent = "<delete><query>*:*</query></delete>" responsePOST = requests.post(urlSolr, data=xmlContent, params=paramsHTML, headers=headersHTML) return self.CheckResponse(responsePOST.text) def DoTask(self, xmlFile): urlSolr = "http://%s:%d/solr/%s/update" % (self.solrHost, self.solrPort, self.solrCollection) headersHTML = {"content-type": "text/xml;charset=utf-8"} paramsHTML = {"commit": "true"} tree = ET.parse(xmlFile) root = tree.getroot() xmlContent = ET.tostring(root, encoding="utf-8", method="xml") responsePOST = requests.post(urlSolr, data=xmlContent, 
params=paramsHTML, headers=headersHTML) return self.CheckResponse(responsePOST.text) if __name__ == '__main__': vendorInfo = VendorInfo(solrCollection='SmartHome') dataPath = '.' if len(sys.argv) > 1: dataPath = sys.argv[1] taskJob = [True, False, False, True] if taskJob[0]: print u'刪除所有文件:', if vendorInfo.RemoveALL(): print u'成功' else: print u'失敗' # 匯入單一資料檔案 if taskJob[1]: dataFile = dataPath vendorInfoID = vendorInfo.ValidateData(dataFile) if vendorInfoID is not None: if vendorInfo.DoTask(dataFile): print "%s\n\t%s" % (dataFile, vendorInfoID) else: print "%s\n\t%s" % (dataFile, 'X') else: print "%s\n\t%s" % (dataFile, 'X') # 匯入符合查詢條件之資料檔案 if taskJob[2]: for dataFile in glob.glob("%s*.xml" % (dataPath)): if dataFile != '.' and dataFile != '..': vendorInfoID = vendorInfo.ValidateData(dataFile) if vendorInfoID is not None: if vendorInfo.DoTask(dataFile): print "%s\n\t%s" % (dataFile, vendorInfoID) else: print "%s\n\t%s" % (dataFile, 'X') else: print "%s\n\t%s" % (dataFile, 'X') # 匯入指定目錄下所有資料檔案 if taskJob[3]: for dataFile in glob.glob("%s/*.xml" % (dataPath)): if dataFile != '.' and dataFile != '..': vendorInfoID = vendorInfo.ValidateData(dataFile) if vendorInfoID is not None: if vendorInfo.DoTask(dataFile): print "%s\n\t%s" % (dataFile, vendorInfoID) else: print "%s\n\t%s" % (dataFile, 'X') else: print "%s\n\t%s" % (dataFile, 'X')