產生字詞頻度表

流程圖:


功能:

  • 濾除 標點符號/引號/四則運算符號/括弧 (並將文字一律轉為小寫)
  • 濾除 常見虛詞 (冠詞/連接詞/介係詞/助動詞等) 與純數字
  • 輸出 CSV 格式

程式碼:

#!/usr/bin/python

import sys
import nltk
import re
from operator import itemgetter

class TextMining:
    """Build a word-frequency table from a piece of text.

    Typical use is ``removeWords`` -> ``buildDictionary`` ->
    ``printOutDictionary``.

    Attributes:
        myText: The normalised text (lower-cased, quotes removed, punctuation
            replaced by spaces). After ``removeWords`` it holds the surviving
            tokens joined by single spaces.
        myTokens: Tokens that survived the last ``removeWords`` call.
        myDictionary: Mapping of token -> occurrence count, filled by
            ``buildDictionary``.
    """

    def __init__(self, myText):
        """Normalise *myText* and initialise empty token/count containers.

        Args:
            myText: Raw input text.
        """
        # Quotes are deleted (not replaced by a space) so that a word such as
        # "don't" stays a single token ("dont").
        self.myText = re.sub('[\'"]', '', myText.lower())
        # Backslash, brackets, braces, punctuation and arithmetic/comparison
        # operators become spaces so they act as token separators. A raw
        # string avoids the invalid-escape warnings of the plain literal.
        self.myText = re.sub(r'[\\\[\]{},.:;!?()\-+*/=><#]', ' ', self.myText)
        self.myTokens = []
        self.myDictionary = {}

    def removeWords(self, unwantedList):
        """Tokenise the text and drop unwanted words and pure-digit tokens.

        The token list is rebuilt on every call, so calling this method more
        than once does not duplicate tokens.

        Args:
            unwantedList: A sequence of word collections (e.g.
                ``[prepositions, auxiliary_verbs]``). A token found in any of
                the collections is discarded.

        Returns:
            The number of tokens kept.
        """
        # Merge all groups into one set: O(1) membership test per token.
        unwanted = set()
        for group in unwantedList:
            unwanted.update(group)

        self.myTokens = [
            token
            for token in nltk.word_tokenize(self.myText)
            if token not in unwanted and not token.isdigit()
        ]
        self.myText = ' '.join(self.myTokens)
        return len(self.myTokens)

    def buildDictionary(self):
        """Count how often each token in ``myTokens`` occurs.

        The counts are recomputed from scratch, so repeated calls do not
        inflate them.

        Returns:
            The number of distinct tokens.
        """
        counts = {}
        for word in self.myTokens:
            counts[word] = counts.get(word, 0) + 1

        self.myDictionary = counts
        return len(counts)

    def printOutDictionary(self, myDelimiter):
        """Print one ``"word"<delimiter>count`` line per distinct token.

        Lines are ordered alphabetically by the whole word (not merely by its
        first character), which makes the output deterministic.

        Args:
            myDelimiter: Separator placed between the quoted word and its
                count, e.g. ``','`` for CSV output.
        """
        for word in sorted(self.myDictionary):
            # A single parenthesised argument keeps this line valid under
            # both the Python 2 print statement and the Python 3 function.
            print('"{0}"{1}{2}'.format(word, myDelimiter, self.myDictionary[word]))

if __name__ == "__main__":
    # Usage: tm_word_count.py <text-file>
    # Prints a CSV word-frequency table for the given file to stdout.
    #
    # sys.argv[0] is always the script name, so an input path is present only
    # when there are at least two entries.
    if len(sys.argv) > 1:
        # Function words to ignore: articles, conjunctions, prepositions, "not".
        myPrep = ['the', 'a', 'an', 'or', 'and', 'of', 'with', 'without', 'over', 'by', 'in', 'on', 'upon', 'at', 'between', 'not']
        # Modal/auxiliary verbs and forms of "to be" to ignore.
        myVerb = ['can', 'cannot', 'could', 'may', 'might', 'shall', 'should', 'will', 'would', 'am', 'are', 'is', 'was', 'were']
        myUnwantedList = [myPrep, myVerb]

        # The context manager closes the file even if reading fails.
        with open(sys.argv[1], 'r') as myFile:
            myText = myFile.read()

        myTextMining = TextMining(myText)
        myTextMining.removeWords(myUnwantedList)
        myTextMining.buildDictionary()

        myTextMining.printOutDictionary(',')
    else:
        # sys.exit with a string writes it to stderr and exits with status 1.
        sys.exit('usage: {0} <text-file>'.format(sys.argv[0]))
ċ
Text-Mining-Flow.archimate
(27k)
李智,
2013年3月16日 上午1:35
ċ
tm_word_count.py
(2k)
李智,
2013年3月14日 下午7:23