產生字詞頻度表

流程圖:

功能:

- 濾除標點符號/四則運算/括弧
- 濾除介係詞/助動詞 (以及冠詞、連接詞與純數字)
- 輸出 CSV 格式

程式碼:

#!/usr/bin/python
"""Build a word-frequency table from a text file and print it as CSV.

Usage: pass the path of the text file as the first command-line argument.
The code is written to run unchanged on both Python 2.7 and Python 3.
"""
import re
import sys
from operator import itemgetter


class TextMining(object):
    """Normalize a text, drop unwanted words and count the remaining ones.

    Attributes:
        myText: the lower-cased text with quotes removed and punctuation
            replaced by spaces; after removeWords() it is the space-joined
            list of kept tokens.
        myTokens: tokens kept by the last removeWords() call.
        myDictionary: word -> occurrence count, filled by buildDictionary().
    """

    # Single and double quotes are deleted outright ("it's" -> "its").
    _QUOTES = re.compile(r"['\"]")
    # Backslash, brackets, braces, parentheses, punctuation and arithmetic /
    # comparison operators are each replaced by one space.
    _PUNCTUATION = re.compile(r"[\\\[\]{},.:;!?()\-+*/=><#]")

    def __init__(self, myText):
        """Lower-case *myText*, strip quotes and blank out punctuation."""
        text = self._QUOTES.sub('', myText.lower())
        self.myText = self._PUNCTUATION.sub(' ', text)
        self.myTokens = []
        self.myDictionary = {}

    def removeWords(self, unwantedList):
        """Tokenize the text and drop unwanted words and all-digit tokens.

        Args:
            unwantedList: a sequence whose first two items are collections
                of words to discard (the script passes [prepositions,
                auxiliary verbs]).

        Returns:
            The number of tokens kept.
        """
        # Imported here because this is the only method that needs nltk;
        # counting and printing stay usable without the package loaded.
        import nltk

        unwanted = set(unwantedList[0]) | set(unwantedList[1])
        # Build a fresh list so that calling this method again does not
        # append a second copy of every token.
        self.myTokens = [
            word
            for word in nltk.word_tokenize(self.myText)
            if word not in unwanted and not word.isdigit()
        ]
        self.myText = ' '.join(self.myTokens)
        return len(self.myTokens)

    def buildDictionary(self):
        """Count how often each token in myTokens occurs.

        The counts are rebuilt from scratch on every call, so calling the
        method twice does not double them.

        Returns:
            The number of distinct words.
        """
        counts = {}
        for word in self.myTokens:
            counts[word] = counts.get(word, 0) + 1
        self.myDictionary = counts
        return len(counts)

    def printOutDictionary(self, myDelimiter):
        """Print one '"word"<delimiter>count' line per word, sorted by word.

        Args:
            myDelimiter: the string placed between the quoted word and its
                count (',' for CSV).
        """
        # Sort the (word, count) pairs on the whole word, not only on its
        # first character, so the output order is deterministic.
        for word, count in sorted(self.myDictionary.items(), key=itemgetter(0)):
            # Parenthesized single argument: valid as a Python 2 print
            # statement and as a Python 3 print() call.
            print('"{0}"{1}{2}'.format(word, myDelimiter, count))


def main():
    """Read the file named on the command line and print its word counts."""
    # sys.argv[0] is the script itself, so a file argument means length >= 2.
    if len(sys.argv) < 2:
        sys.exit('usage: {0} TEXT_FILE'.format(sys.argv[0]))

    myPrep = ['the', 'a', 'an', 'or', 'and', 'of', 'with', 'without', 'over',
              'by', 'in', 'on', 'upon', 'at', 'between', 'not']
    myVerb = ['can', 'cannot', 'could', 'may', 'might', 'shall', 'should',
              'will', 'would', 'am', 'are', 'is', 'was', 'were']
    myUnwantedList = [myPrep, myVerb]

    with open(sys.argv[1], 'r') as textFile:
        myText = textFile.read()

    myTextMining = TextMining(myText)
    myTextMining.removeWords(myUnwantedList)
    myTextMining.buildDictionary()
    myTextMining.printOutDictionary(',')


if __name__ == "__main__":
    main()

產生字詞頻度表

產生網路圖