詞頻分群 (處理 Big5)

說明

    • 處理 Windows 繁體中文 DOS 視窗環境下:
      • 輸出問題 (UTF-8 -> Big5)
      • 命令列輸入問題 (Big5 -> UTF-8)
    • 關鍵字查詢 Solr
    • 計算詞頻
    • 計算最重要詞

程式碼

# -*- coding: utf-8 -*- import json import urllib2 import sys from operator import itemgetter class SolrTool: solrURL = "http://%s:%d/solr/%s/" solrQuery = "select?q=%s&df=%s&wt=json&indent=true" stopWords = [u'相關', u'主要', u'現有', u'成為'] def __init__(self, hostName='localhost', portNum=8983, collectionName='MICRONIX'): self.solrURL = self.solrURL % (hostName, portNum, collectionName) # 查詢 Solr def Query(self, queryTerm, fieldName='news_t'): queryTerm = urllib2.quote(queryTerm) queryURL = self.solrQuery % (queryTerm, fieldName) queryURL = '%s%s' % (self.solrURL, queryURL) jsonContent = None try: solrResult = urllib2.urlopen(queryURL) jsonContent = json.load(solrResult) except: print queryURL return jsonContent # 解讀 Solr 傳回值 def getResponse(self, jsonContent, fieldName='news_t'): if jsonContent is None: return None responseHeader = jsonContent['responseHeader'] response = jsonContent['response'] docs = response['docs'] dataResult = { 'status': responseHeader['status'], 'numFound': response['numFound'], 'docs': docs } return dataResult # 計算詞頻 def TermFreq(self, dataSet): txtContent = [] for doc in dataSet: news_t = doc['news_t'] txtContent.append(news_t) termList = (' '.join(txtContent)).split() termDict = {} for term in termList: if len(term) > 1: if term not in self.stopWords: try: termDict[term] += 1.0 except: termDict[term] = 1.0 return termDict # 計算最重要詞 def TopClusterTermFreq(self, dictTermFreq, numCluster=3): totalFreq = sum(dictTermFreq.values()) areaOfCuster = totalFreq / numCluster termFreqList = sorted(dictTermFreq.items(), key=itemgetter(1), reverse=True) topResult = [] for term in termFreqList: topResult.append([term[0], term[1]]) areaOfCuster -= term[1] if areaOfCuster <= 0: break return topResult # 列印最重要詞 def PrintTopCluster(self, topResult): for term in topResult: print "%s\t%d" % (term[0].encode('big5'), term[1]) # 列印詞頻 def PrintTermFreq(self, dictTermFreq): totalFreq = sum(dictTermFreq.values()) termFreqList = sorted(dictTermFreq.items(), key=itemgetter(1), reverse=True) for term in termFreqList: if term[1] > 1: ratio = float(term[1]) / totalFreq print '%8s\t%2d\t%.4f' % (term[0].encode('big5'), term[1], ratio) # 列印裸資料 def PrintRawResponse(self, jsonContent): objList = json.load(jsonContent) for w in objList: for w2 in w: k = str(objList[w2]) print k.encode('big5') if __name__ == "__main__": if len(sys.argv) < 1: sys.exit() worker = SolrTool() solrTerm = sys.argv[1] solrTerm = solrTerm.decode('big5').encode('utf-8') jsonContent = worker.Query(solrTerm) dataResult = worker.getResponse(jsonContent) if dataResult is None: sys.exit() taskControl = [False, False, True] if taskControl[0]: print dataResult['status'], dataResult['numFound'] for w in dataResult['docs']: idDoc = w['id'] news_t = w['news_t'] print '\t', idDoc print '\t', news_t.encode('big5') if taskControl[1]: dictTermFreq = worker.TermFreq(dataResult['docs']) worker.PrintTermFreq(dictTermFreq) if taskControl[2]: dictTermFreq = worker.TermFreq(dataResult['docs']) topResult = worker.TopClusterTermFreq(dictTermFreq) worker.PrintTopCluster(topResult)