中文分詞

新聞體中文分詞

#!/usr/bin/python
# -*- coding: utf-8 -*-

import sys
import codecs
import string
import re
import sqlite3

class NewsParser(object):
    """Merge whitespace-separated tokens of a news text into dictionary terms.

    The news file is read as UTF-8 from ``DataPath/NewsPaper/NewsThread.txt``
    and the dictionary is the SQLite database ``DataPath/NewsDictDB``, which
    must contain a table ``chinese`` with a text column ``Term``.
    """

    # Root directory holding both the newspaper folders and the dictionary.
    DataPath = ""
    # Name of the newspaper sub-directory below DataPath.
    NewsPaper = ""
    # Base name (without ".txt") of the news file to process.
    NewsThread = ""
    # Path of the SQLite dictionary, relative to DataPath.
    NewsDictDB = ""
    # Open sqlite3 connection; doTask() sets it, CheckDictionary() reads it.
    DbConnection = None

    # NOTE(review): only these exact characters are treated as delimiters;
    # other punctuation variants are left untouched - confirm this is intended.
    _ChineseDelimiters = re.compile(u"[。?!,、:]")
    # '|' is deliberately absent: inside a character class it is a literal
    # pipe, not an alternation, so listing it would split on '|' as well.
    _EnglishDelimiters = re.compile(r"[.,?!]")

    def __init__(self, DataPath, NewsPaper, NewsThread, NewsDictDB):
        """Remember where the news file and the dictionary database live."""
        self.DataPath = DataPath
        self.NewsPaper = NewsPaper
        self.NewsThread = NewsThread
        self.NewsDictDB = NewsDictDB

    def SeparateChineseDelimiters(self, ChineseWords):
        """Split text at every sentence/clause delimiter.

        Returns the pieces in their original order. Empty pieces (adjacent
        delimiters, or a delimiter at either end) are kept, so an empty input
        yields a list holding one empty string.
        """
        return self._ChineseDelimiters.split(ChineseWords)

    def SeparateEnglishDelimiters(self, EnglishWords):
        """Split text at '.', ',', '?' and '!'; empty pieces are kept."""
        return self._EnglishDelimiters.split(EnglishWords)

    def CheckDictionary(self, NewsWords):
        """Look up every dictionary term that starts with ``NewsWords``.

        Returns ``(DbFound, TermList)``: ``TermList`` holds the matching terms,
        longest first, and ``DbFound`` tells whether it is non-empty.
        ``self.DbConnection`` must already be an open sqlite3 connection.
        """
        # Bind the word as a parameter so quotes in the text cannot break (or
        # inject into) the statement, and escape the LIKE wildcards so that
        # '%' and '_' in the text only match themselves.
        Prefix = (NewsWords.replace("\\", "\\\\")
                           .replace("%", "\\%")
                           .replace("_", "\\_"))
        DbSQL = ("SELECT Term, LENGTH(TERM) AS LTERM FROM chinese "
                 "WHERE Term LIKE ? ESCAPE '\\' ORDER BY LTERM DESC")
        DbRows = self.DbConnection.execute(DbSQL, (Prefix + "%",))

        TermList = [DbRow[0] for DbRow in DbRows]
        return (bool(TermList), TermList)

    @staticmethod
    def _CountWordsForTerm(NewsWords, Start, Term):
        """Return how many words from ``Start`` concatenate exactly to ``Term``.

        Returns 0 when no run of consecutive words starting at ``Start``
        spells ``Term``.
        """
        Count = 0
        Length = 0
        while Length < len(Term) and Start + Count < len(NewsWords):
            Length += len(NewsWords[Start + Count])
            if Length > len(Term):
                # The next word overshoots the term, so it cannot fit.
                return 0
            Count += 1

        # Requiring Count > 0 guarantees the caller always advances.
        if Count and u"".join(NewsWords[Start:Start + Count]) == Term:
            return Count
        return 0

    def ProcessNewsLine(self, NewsLine):
        """Greedily merge the whitespace-separated words of one line.

        At each position the longest dictionary term that is spelled by the
        following consecutive words replaces those words; a word that starts
        no such term is passed through unchanged. Returns the resulting list.
        """
        NewsWords = NewsLine.split()
        NewsWordList = []
        i = 0
        while i < len(NewsWords):
            DbFound, TermList = self.CheckDictionary(NewsWords[i])
            # TermList is ordered longest first, so the first hit wins.
            for Term in TermList:
                Consumed = self._CountWordsForTerm(NewsWords, i, Term)
                if Consumed:
                    NewsWordList.append(Term)
                    i += Consumed
                    break
            else:
                NewsWordList.append(NewsWords[i])
                i += 1

        return NewsWordList

    @staticmethod
    def _WriteTerms(TermList):
        """Write one output line: every term followed by a space and a tab."""
        # This reproduces byte-for-byte what the Python 2 statement
        # ``print term, '\t',`` followed by a bare ``print`` emitted.
        Text = u"".join(Term + u" \t" for Term in TermList) + u"\n"
        if sys.version_info[0] < 3:
            # Python 2 byte streams need explicitly encoded text.
            Text = Text.encode("utf-8")
        sys.stdout.write(Text)

    def doTask(self):
        """Segment the configured news file and print the terms to stdout.

        Each delimiter-separated piece of each input line becomes one output
        line. The file and the database connection are closed even when
        processing fails.
        """
        # Build both paths from the instance configuration instead of relying
        # on module-level globals that only exist when run as a script.
        NewsFile = "%s/%s/%s.txt" % (self.DataPath, self.NewsPaper,
                                     self.NewsThread)
        NewsDictionary = "%s/%s" % (self.DataPath, self.NewsDictDB)

        self.DbConnection = sqlite3.connect(NewsDictionary)
        try:
            NewsFileHandle = codecs.open(NewsFile, "r", encoding="utf-8")
            try:
                for NewsParagraph in NewsFileHandle:
                    for NewsLine in self.SeparateChineseDelimiters(NewsParagraph):
                        self._WriteTerms(self.ProcessNewsLine(NewsLine))
            finally:
                NewsFileHandle.close()
        finally:
            self.DbConnection.close()

if __name__ == "__main__":
    # Usage: NewsParser.py [NewsThread [NewsPaper]]
    DataPath = "/home/richclee/Desktop/News"
    NewsDictDB = "Dictionaries/NewsDict.db"

    # Positional arguments override the built-in defaults; extras are ignored.
    CommandArgs = sys.argv[1:]
    NewsThread = CommandArgs[0] if len(CommandArgs) >= 1 else "0320"
    NewsPaper = CommandArgs[1] if len(CommandArgs) >= 2 else "ChinaTimes"

    # These two names stay module-level globals (spelling included) in case
    # other code in the file looks them up by name.
    NewsDictionay = "%s/%s" % (DataPath, NewsDictDB)
    NewsFile = "%s/%s/%s.txt" % (DataPath, NewsPaper, NewsThread)

    newsParser = NewsParser(DataPath, NewsPaper, NewsThread, NewsDictDB)
    newsParser.doTask()

字典 SQLite 資料結構

附件:
0319.txt
(6k)
李智,
2014年6月1日 下午1:44
附件:
NewsDict.db
(17k)
李智,
2014年6月1日 下午8:11
附件:
NewsParser.py
(4k)
李智,
2014年6月1日 下午8:11