匯入資料至全文檢索 (Solr)

操作圖智慧家庭 詞頻分群


XML 資料結構

<add>
	<doc>
		<!-- 商品服務提供商識別碼 (id),留空時由程式自動產生 -->
		<field name="id"></field>

		<!-- 商品服務提供商名稱 -->
		<field name="author">先進整合</field>
		
		<!-- 商品服務提供商概述 -->
		<field name="comments">致力以完整扎實的技術根基,結合創新思維,協助客戶將科技適切地融入居家環境與個人生活。</field>

		<!-- 商品服務網址 -->
		<field name="url">http://www.i2.com.tw/index.php/services</field>

		<!-- 商品服務分類 -->
		<field name="sku">智慧化住宅解決方案</field>

		<!-- 商品服務名稱 -->
		<field name="name">智慧化住宅解決方案</field>

		<!-- 商品服務概述 -->
		<field name="description">住宅智慧控制、燈光/能源管理控制</field>

		<!-- 商品服務功能 -->
		<field name="features"></field>

		<!-- 商品服務關鍵字 -->
		<field name="keywords">門禁控管</field>
		
		<!-- 智慧家庭應用分類 -->
		<field name="cat">居家安全</field>
		<field name="cat">環境監控</field>
	</doc>
</add>

全文檢索


程式碼

# -*- coding: utf-8 -*-

import sys
import glob
import uuid
import xml.etree.ElementTree as ET
import requests


class VendorInfo:
    """Validate vendor XML documents and load them into a Solr collection.

    The XML files are Solr update messages: an ``<add>`` root holding one or
    more ``<doc>`` elements, each made of ``<field name="...">`` children.
    """

    # Class-level defaults; __init__ overrides them on every instance.
    solrHost = 'localhost'
    solrPort = 8983
    solrCollection = 'collection1'

    def __init__(self, solrCollection='collection1', solrHost='localhost', solrPort=8983):
        """Remember which Solr host, port and collection to talk to."""
        self.solrHost = solrHost
        self.solrPort = solrPort
        self.solrCollection = solrCollection

    def ValidateData(self, xmlFile):
        """Make sure every ``<doc>`` in *xmlFile* carries a non-empty ``id`` field.

        A ``<doc>`` whose ``id`` field is missing, empty or whitespace-only
        gets a freshly generated 32-character hexadecimal UUID. When at least
        one ``<doc>`` was processed the file is rewritten in place (UTF-8,
        with an XML declaration).

        Args:
            xmlFile: path of the XML file to check and, if needed, update.

        Returns:
            The id of the last ``<doc>`` in the file, or ``None`` when the
            file has no ``<doc>`` under its root or processing failed.

        Raises:
            Whatever ``ET.parse`` raises for an unreadable or malformed file;
            parsing happens outside the guarded section.
        """
        tree = ET.parse(xmlFile)
        root = tree.getroot()

        vendorInfoID = None

        try:
            for doc in root.findall('doc'):
                idField = doc.find("field[@name='id']")
                if idField is None:
                    # The field is absent altogether: create it instead of
                    # dereferencing None (which used to fail the whole file).
                    idField = ET.Element('field', {'name': 'id'})
                    doc.insert(0, idField)

                if idField.text is None or not idField.text.strip():
                    # uuid4().hex is the UUID string without the dashes.
                    idField.text = uuid.uuid4().hex

                vendorInfoID = idField.text
        except Exception:
            print("\n\n%s 錯誤:%s\n\n" % (xmlFile, sys.exc_info()[0]))

            vendorInfoID = None

        if vendorInfoID is not None:
            tree.write(xmlFile, encoding="UTF-8", xml_declaration=True)

        return vendorInfoID

    def CheckResponse(self, responseXML):
        """Tell whether a Solr XML response reports success.

        Looks at the children of every top-level ``<lst>`` element for one
        whose ``name`` attribute is ``status``; the last such value wins.

        Args:
            responseXML: the response body (text or bytes).

        Returns:
            ``True`` only when a status entry was found and equals 0.
            A body that is not well-formed XML, or a status that is not an
            integer, counts as failure.
        """
        try:
            root = ET.fromstring(responseXML)
        except ET.ParseError:
            # e.g. an HTML error page instead of a Solr response.
            return False

        statusPOST = -1

        for lstTag in root.findall('lst'):
            for inTag in lstTag:
                # .get() tolerates children that have no "name" attribute.
                if inTag.get('name') == 'status':
                    try:
                        statusPOST = int(inTag.text)
                    except (TypeError, ValueError):
                        statusPOST = -1

        return (statusPOST == 0)

    def _PostUpdate(self, xmlContent):
        """POST *xmlContent* to the collection's update handler with commit=true.

        Returns the verdict of ``CheckResponse`` on the response body.
        """
        urlSolr = "http://%s:%d/solr/%s/update" % (self.solrHost, self.solrPort, self.solrCollection)
        headersHTML = {"content-type": "text/xml;charset=utf-8"}
        paramsHTML = {"commit": "true"}

        responsePOST = requests.post(urlSolr, data=xmlContent, params=paramsHTML, headers=headersHTML)

        # Hand the raw bytes to the XML parser so that it applies the
        # encoding declared by the response itself.
        return self.CheckResponse(responsePOST.content)

    def RemoveALL(self):
        """Delete every document in the collection.

        Returns:
            ``True`` when Solr reports status 0, ``False`` otherwise.
        """
        return self._PostUpdate("<delete><query>*:*</query></delete>")

    def DoTask(self, xmlFile):
        """Upload the documents stored in *xmlFile* to the collection.

        The file is parsed and re-serialised as UTF-8 XML before being sent.

        Args:
            xmlFile: path of the XML update message to upload.

        Returns:
            ``True`` when Solr reports status 0, ``False`` otherwise.
        """
        root = ET.parse(xmlFile).getroot()
        xmlContent = ET.tostring(root, encoding="utf-8", method="xml")

        return self._PostUpdate(xmlContent)


def _ImportFile(vendorInfo, dataFile):
    """Validate one XML file, upload it, and print the outcome.

    Prints the file name followed by the document id on success, or by
    'X' when validation or the upload failed.
    """
    vendorInfoID = vendorInfo.ValidateData(dataFile)
    if vendorInfoID is not None and vendorInfo.DoTask(dataFile):
        print("%s\n\t%s" % (dataFile, vendorInfoID))
    else:
        print("%s\n\t%s" % (dataFile, 'X'))


if __name__ == '__main__':
    # Usage: the optional first argument is the data path (default '.').
    # The print calls below use the single-argument parenthesised form,
    # which is valid as both a Python 2 statement and a Python 3 call.
    vendorInfo = VendorInfo(solrCollection='SmartHome')
    dataPath = '.'

    if len(sys.argv) > 1:
        dataPath = sys.argv[1]

    # Switches for the four steps below, in order:
    # [0] delete everything, [1] import dataPath as a single file,
    # [2] import files matching "<dataPath>*.xml",
    # [3] import every "*.xml" directly inside the dataPath directory.
    taskJob = [True, False, False, True]

    if taskJob[0]:
        removed = vendorInfo.RemoveALL()
        print(u'刪除所有文件: ' + (u'成功' if removed else u'失敗'))

    # Import a single data file.
    if taskJob[1]:
        _ImportFile(vendorInfo, dataPath)

    # Import the data files whose path starts with dataPath.
    if taskJob[2]:
        # sorted() gives a deterministic processing order.
        for dataFile in sorted(glob.glob("%s*.xml" % (dataPath))):
            _ImportFile(vendorInfo, dataFile)

    # Import every data file in the given directory.
    if taskJob[3]:
        for dataFile in sorted(glob.glob("%s/*.xml" % (dataPath))):
            _ImportFile(vendorInfo, dataFile)
附件:
061-先進整合.xml
(1k)
李智,
2015年4月25日 上午2:57
附件:
VendorInfoSolr.py
(4k)
李智,
2015年4月25日 上午3:02