產生字詞間關係頻度表

功能:

  • 濾除 標點符號/四則運算/括弧
  • 濾除 介係詞/助動詞/所有格/關係代名詞
  • 輸出 CSV 格式

程式碼:

#!/usr/bin/python
"""Build a frequency table of adjacent word pairs found in a text file.

The text is lower-cased, stripped of quotes, punctuation, arithmetic
operators and brackets, filtered against a stop-word list, and the
remaining adjacent word pairs are counted and printed as CSV rows.

Usage: python SCRIPT TEXT_FILE
"""

import re
import sys
from operator import itemgetter

# Single and double quotes are deleted outright, so "it's" becomes "its".
_QUOTES_RE = re.compile('[\'"]')
# Backslash, brackets, braces, parentheses, punctuation and arithmetic /
# comparison operators are each replaced by a single space.
_PUNCT_RE = re.compile(r'[\\\[\]{},.:;!?()\-+*/=><#]')


class TextMining:
    """Tokenize a text and count words and adjacent word pairs.

    Attributes:
        myText: the normalized text (rewritten by removeWords()).
        myTokens: the filtered tokens, filled in by removeWords().
        myDictionary: word -> occurrence count, filled in by
            buildDictionary().
        myWordLinks: 'first|second' -> count of that adjacent pair,
            filled in by buildWordLinks().
    """

    def __init__(self, myText):
        """Normalize *myText*: lower-case it, drop quotes, blank out punctuation."""
        self.myText = _QUOTES_RE.sub('', myText.lower())
        self.myText = _PUNCT_RE.sub(' ', self.myText)
        self.myTokens = []
        self.myDictionary = {}
        self.myWordLinks = {}

    def removeWords(self, unwantedList):
        """Tokenize the text and drop unwanted words and all-digit tokens.

        Args:
            unwantedList: iterable of (lower-case) words to discard.

        Returns:
            The number of tokens kept. self.myTokens holds those tokens and
            self.myText is rewritten as the tokens joined by single spaces.
        """
        # Imported here rather than at module level so that the counting and
        # printing helpers stay usable when NLTK is not installed.
        import nltk

        # A set gives O(1) membership tests instead of scanning a list per token.
        myUnwanted = set(unwantedList)
        # The token list is rebuilt (not appended to) so that calling this
        # method again filters further instead of duplicating tokens.
        self.myTokens = [
            myWord
            for myWord in nltk.word_tokenize(self.myText)
            if myWord not in myUnwanted and not myWord.isdigit()
        ]
        self.myText = ' '.join(self.myTokens)
        return len(self.myTokens)

    def buildDictionary(self):
        """Count the occurrences of every token.

        Returns:
            The number of distinct words. Counts are rebuilt from scratch on
            every call, so repeated calls do not inflate them.
        """
        myCounts = {}
        for myWord in self.myTokens:
            myCounts[myWord] = myCounts.get(myWord, 0) + 1
        self.myDictionary = myCounts
        return len(self.myDictionary)

    def buildWordLinks(self):
        """Count every pair of adjacent tokens, keyed as 'first|second'.

        Returns:
            The number of distinct pairs. Counts are rebuilt from scratch on
            every call, so repeated calls do not inflate them.
        """
        myCounts = {}
        # zip() of the list with itself shifted by one yields adjacent pairs
        # and naturally yields nothing for fewer than two tokens.
        for myFirst, mySecond in zip(self.myTokens, self.myTokens[1:]):
            myWordPair = myFirst + '|' + mySecond
            myCounts[myWordPair] = myCounts.get(myWordPair, 0) + 1
        self.myWordLinks = myCounts
        return len(self.myWordLinks)

    def printOutDictionary(self, myDelimiter):
        """Print one '"word"<delimiter>count' row per word, sorted by word."""
        # Sorting the (word, count) items on the word orders by the whole
        # word, not merely by its first character.
        for myKey, myCount in sorted(self.myDictionary.items(), key=itemgetter(0)):
            print('"{0}"{1}{2}'.format(myKey, myDelimiter, myCount))

    def printOutWordLinks(self, myDelimiter):
        """Print one '"first"<d> "second"<d> count' row per pair.

        Rows are sorted by the first word, then by the second word.
        """
        myRows = []
        for myKey, myCount in self.myWordLinks.items():
            myWordPairs = myKey.split('|')
            myRows.append((myWordPairs[0], myWordPairs[1], myCount))
        for myFirst, mySecond, myCount in sorted(myRows, key=itemgetter(0, 1)):
            print('"{0}"{1} "{2}"{3} {4}'.format(
                myFirst, myDelimiter, mySecond, myDelimiter, myCount))


def main(argv=None):
    """Print the word-pair frequency table of the file named in argv[1].

    Args:
        argv: command-line argument list; defaults to sys.argv.

    Returns:
        0 on success, 2 when no input file was given.
    """
    if argv is None:
        argv = sys.argv
    # argv[0] is the script name, so an input file requires at least 2 entries.
    if len(argv) < 2:
        myProgram = argv[0] if argv else 'script'
        sys.stderr.write('usage: {0} TEXT_FILE\n'.format(myProgram))
        return 2

    myPrep = ['the', 'a', 'an', 'or', 'and', 'of', 'with', 'without', 'over',
              'to', 'by', 'in', 'on', 'upon', 'at', 'between', 'not', 'for']
    # Quotes are stripped before tokenizing, so "doesn't" shows up as
    # 'doesnt'; the historical spelling 'dosnt' is kept as well.
    myAuxiVerb = ['can', 'cannot', 'could', 'may', 'might', 'shall', 'should',
                  'will', 'would', 'am', 'are', 'is', 'was', 'were', 'do',
                  'does', 'dosnt', 'doesnt']
    myOthers1 = ['you', 'i', 'he', 'she', 'me', 'they', 'them', 'mine', 'his',
                 'hers', 'my', 'which', 'where', 'what', 'how', 'when', 'this',
                 'that', 'it', 'its', 'each', 'every']
    myOthers2 = ['more', 'much', 'better', 'best', 'less', 'least', 'farther',
                 'further', 'very', 'poor']
    myUnwantedList = myPrep + myAuxiVerb + myOthers1 + myOthers2

    # 'with' guarantees the file handle is closed once the text is read.
    with open(argv[1], 'r') as myFile:
        myText = myFile.read()

    myTextMining = TextMining(myText)
    myTextMining.removeWords(myUnwantedList)
    myTextMining.buildWordLinks()
    myTextMining.printOutWordLinks(',')
    return 0


if __name__ == "__main__":
    sys.exit(main())