中文分詞

新聞體中文分詞

#!/usr/bin/python
# -*- coding: utf-8 -*-
# Dictionary-driven word segmentation for news text.
#
# Reads a UTF-8 news file whose tokens are separated by whitespace, merges
# runs of adjacent tokens that spell a term stored in a SQLite dictionary
# (table ``chinese``, column ``Term``) and writes the resulting terms to
# standard output.  The code avoids version-specific syntax so that it runs
# on both Python 2 and Python 3.
import sys
import codecs
import string
import re
import sqlite3


class NewsParser(object):
    """Segment whitespace-tokenised news text using a SQLite term dictionary.

    The constructor only records where the input lives; ``doTask`` opens the
    dictionary and the news file, and ``DbConnection`` must be set before
    ``CheckDictionary`` or ``ProcessNewsLine`` is called directly.
    """

    # Class-level defaults; every instance overwrites them in __init__/doTask.
    DataPath = ""
    NewsPaper = ""
    NewsThread = ""
    NewsDictDB = ""
    DbConnection = None

    # Sentence/clause delimiters used by SeparateChineseDelimiters.
    # NOTE(review): only the ASCII forms of ? ! , : are listed, so full-width
    # variants are not treated as delimiters - confirm that this is intended.
    _ChineseDelimiterPattern = re.compile(u"[。?!,、:]")

    # A character class needs no "|" between alternatives; writing one would
    # make "|" itself a delimiter.
    _EnglishDelimiterPattern = re.compile(r"[.,?!]")

    # The Python 2 ``print term, '\t',`` statement emitted a space and a tab
    # after every term; the same bytes are kept so the output is unchanged.
    _TermSuffix = u" \t"

    def __init__(self, DataPath, NewsPaper, NewsThread, NewsDictDB):
        """Remember the data root, newspaper folder, thread id and DB path.

        The news file is ``<DataPath>/<NewsPaper>/<NewsThread>.txt`` and the
        dictionary is ``<DataPath>/<NewsDictDB>``.
        """
        self.DataPath = DataPath
        self.NewsPaper = NewsPaper
        self.NewsThread = NewsThread
        self.NewsDictDB = NewsDictDB

    def SeparateChineseDelimiters(self, ChineseWords):
        """Split text at every Chinese delimiter and return the pieces.

        Delimiters are dropped; empty pieces between adjacent delimiters are
        kept, and an empty input yields ``[""]``.
        """
        return self._ChineseDelimiterPattern.split(ChineseWords)

    def SeparateEnglishDelimiters(self, EnglishWords):
        """Split text at ``.``, ``,``, ``?`` and ``!`` and return the pieces."""
        return self._EnglishDelimiterPattern.split(EnglishWords)

    @staticmethod
    def _EscapeLike(Text):
        """Escape LIKE wildcards so *Text* is matched literally."""
        return (Text.replace("\\", "\\\\")
                    .replace("%", "\\%")
                    .replace("_", "\\_"))

    def CheckDictionary(self, NewsWords):
        """Look up dictionary terms that start with *NewsWords*.

        Returns a ``(found, terms)`` tuple: ``terms`` lists the matching
        terms, longest first, and ``found`` is True when it is non-empty.
        """
        DbSQL = ("SELECT Term, LENGTH(Term) AS LTERM FROM chinese "
                 "WHERE Term LIKE ? ESCAPE '\\' ORDER BY LTERM DESC")
        # Bound parameter: quotes in the text can no longer break (or inject
        # into) the statement, and "%"/"_" in the text match only themselves.
        DbRows = self.DbConnection.execute(
            DbSQL, (self._EscapeLike(NewsWords) + "%",))
        TermList = [DbRow[0] for DbRow in DbRows]
        return (bool(TermList), TermList)

    @staticmethod
    def _MatchTerm(Term, NewsWords, Start):
        """Return how many tokens from *Start* spell *Term* exactly, else 0."""
        Pieces = []
        Length = 0
        for Index in range(Start, len(NewsWords)):
            Length += len(NewsWords[Index])
            if Length > len(Term):
                # Token boundaries do not line up with the end of the term.
                return 0
            Pieces.append(NewsWords[Index])
            if Length == len(Term):
                break
        if Pieces and "".join(Pieces) == Term:
            return len(Pieces)
        return 0

    def ProcessNewsLine(self, NewsLine):
        """Merge adjacent tokens of *NewsLine* into dictionary terms.

        The line is split on whitespace.  At each position the longest
        dictionary term that is spelled exactly by the following tokens
        replaces those tokens; a token with no such term is kept as is.
        Returns the resulting list of terms.
        """
        NewsWords = NewsLine.split()
        NewsWordList = []
        i = 0
        while i < len(NewsWords):
            TermList = self.CheckDictionary(NewsWords[i])[1]
            Consumed = 0
            # Candidates arrive longest first, so the first hit is the
            # longest possible match.
            for Term in TermList:
                Consumed = self._MatchTerm(Term, NewsWords, i)
                if Consumed:
                    NewsWordList.append(Term)
                    break
            if not Consumed:
                NewsWordList.append(NewsWords[i])
                Consumed = 1
            i += Consumed
        return NewsWordList

    def _FormatTerms(self, TermList):
        """Render one output line: each term plus suffix, then a newline."""
        return u"".join(
            [Term + self._TermSuffix for Term in TermList]) + u"\n"

    def doTask(self):
        """Segment the configured news file and write the terms to stdout.

        Each paragraph (file line) is split at the Chinese delimiters; every
        resulting piece produces one UTF-8 encoded output line.  The file and
        the database connection are closed even when processing fails.
        """
        # Paths come from the instance rather than from script-level globals,
        # so the class also works when imported from another module.
        NewsFile = "%s/%s/%s.txt" % (
            self.DataPath, self.NewsPaper, self.NewsThread)
        NewsDictionary = "%s/%s" % (self.DataPath, self.NewsDictDB)

        # Python 3 text streams expose the byte stream as ``.buffer``;
        # Python 2's sys.stdout accepts bytes directly.
        Output = getattr(sys.stdout, "buffer", sys.stdout)

        self.DbConnection = sqlite3.connect(NewsDictionary)
        try:
            with codecs.open(NewsFile, "r", encoding="utf-8") as NewsFileHandle:
                for NewsParagraph in NewsFileHandle:
                    for NewsLine in self.SeparateChineseDelimiters(NewsParagraph):
                        TermList = self.ProcessNewsLine(NewsLine)
                        Output.write(
                            self._FormatTerms(TermList).encode("utf-8"))
            Output.flush()
        finally:
            self.DbConnection.close()


def main():
    """Run the parser; argv[1] overrides the thread, argv[2] the newspaper."""
    DataPath = "/home/richclee/Desktop/News"
    NewsPaper = "ChinaTimes"
    NewsThread = "0320"
    NewsDictDB = "Dictionaries/NewsDict.db"
    if len(sys.argv) > 1:
        NewsThread = sys.argv[1]
    if len(sys.argv) > 2:
        NewsPaper = sys.argv[2]
    newsParser = NewsParser(DataPath, NewsPaper, NewsThread, NewsDictDB)
    newsParser.doTask()


if __name__ == "__main__":
    main()

字典 SQLite 資料結構