Generating a Word-Pair Relationship Frequency Table

Features:

  • Filter out punctuation / arithmetic operators / brackets (a small sketch of this cleanup follows this list)
  • Filter out prepositions / auxiliary verbs / possessives / relative pronouns
  • Output in CSV format
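
A minimal sketch of the cleanup behind the first item (the sample string is an assumption, not from the original page); it applies the same two substitutions as the TextMining constructor in the code below: quotes are deleted, while punctuation, operators and brackets become spaces (digits survive this step and are dropped later by removeWords):

import re

sample = 'He said: "3 + 4 = 7" (really?)'
text = re.sub('[\'"]', '', sample.lower())                 # delete quote characters
text = re.sub(r'[\\\[\]{},.:;!?()\-+*/=><#]', ' ', text)   # punctuation/operators/brackets -> spaces
print(text)   # punctuation and operators are now spaces; digits remain until removeWords()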

Code:

#!/usr/bin/env python3

import sys
import re

import nltk

class TextMining:
    def __init__(self, myText):
        # Lowercase the text, delete quote characters, then replace punctuation,
        # arithmetic operators and brackets with spaces.
        self.myText = re.sub('[\'"]', '', myText.lower())
        self.myText = re.sub(r'[\\\[\]{},.:;!?()\-+*/=><#]', ' ', self.myText)
        self.myTokens = []        # filtered word tokens
        self.myDictionary = {}    # word -> frequency
        self.myWordLinks = {}     # 'word1|word2' -> adjacent-pair frequency
        
    def removeWords(self, unwantedList):
        # Tokenize the cleaned text and keep only tokens that are neither
        # in the stop-word list nor pure digits.
        for myWord in nltk.word_tokenize(self.myText):
            if myWord not in unwantedList and not myWord.isdigit():
                self.myTokens.append(myWord)

        self.myText = ' '.join(self.myTokens)
        return len(self.myTokens)
        
    def buildDictionary(self):
        # Count how often each remaining word occurs.
        for myWord in self.myTokens:
            self.myDictionary[myWord] = self.myDictionary.get(myWord, 0) + 1

        return len(self.myDictionary)
    
    def buildWordLinks(self):
        # Count every ordered pair of adjacent words, keyed as 'word1|word2'.
        for i in range(len(self.myTokens) - 1):
            myWordPair = self.myTokens[i] + '|' + self.myTokens[i + 1]
            self.myWordLinks[myWordPair] = self.myWordLinks.get(myWordPair, 0) + 1

        return len(self.myWordLinks)
    
    def printOutDictionary(self, myDelimiter):
        # One CSV row per word, sorted alphabetically: "word",count
        for myKey in sorted(self.myDictionary):
            print('"{0}"{1}{2}'.format(myKey, myDelimiter, self.myDictionary[myKey]))

    def printOutWordLinks(self, myDelimiter):
        # One CSV row per adjacent word pair, sorted alphabetically: "word1", "word2", count
        for myKey in sorted(self.myWordLinks):
            myWordPair = myKey.split('|')
            print('"{0}"{1} "{2}"{3} {4}'.format(myWordPair[0], myDelimiter,
                                                 myWordPair[1], myDelimiter,
                                                 self.myWordLinks[myKey]))

if __name__ == "__main__":
    if len(sys.argv) > 1:
        # Stop words to filter out: prepositions/articles/conjunctions,
        # auxiliary verbs, pronouns/possessives/relative pronouns, and
        # a few comparatives and qualifiers.
        myPrep = ['the', 'a', 'an', 'or', 'and', 'of', 'with', 'without', 'over',
                  'to', 'by', 'in', 'on', 'upon', 'at', 'between', 'not', 'for']
        myAuxiVerb = ['can', 'cannot', 'could', 'may', 'might', 'shall', 'should', 'will', 'would',
                      'am', 'are', 'is', 'was', 'were', 'do', 'does', 'doesnt']
        myOthers1 = ['you', 'i', 'he', 'she', 'me', 'they', 'them', 'mine', 'his', 'hers',
                     'my', 'which', 'where', 'what', 'how', 'when', 'this', 'that', 'it', 'its', 'each', 'every']
        myOthers2 = ['more', 'much', 'better', 'best', 'less', 'least', 'farther', 'further', 'very', 'poor']

        myUnwantedList = myPrep + myAuxiVerb + myOthers1 + myOthers2

        with open(sys.argv[1], 'r') as myFile:
            myText = myFile.read()

        myTextMining = TextMining(myText)
        myWordCount = myTextMining.removeWords(myUnwantedList)
        myWordLinksCount = myTextMining.buildWordLinks()

        myTextMining.printOutWordLinks(',')
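
Usage example:

A minimal sketch, not part of the original attachment: the file sample.txt and its one-line content "the cat sat on the mat and the cat ran" are assumptions, and NLTK's Punkt tokenizer data must be available first (e.g. nltk.download('punkt')). Running

python tm-word-net-count.py sample.txt

drops the stop words 'the', 'on' and 'and', keeps "cat sat mat cat ran", and prints one CSV row per adjacent word pair:

"cat", "ran", 1
"cat", "sat", 1
"mat", "cat", 1
"sat", "mat", 1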
Attachment: tm-word-net-count.py (3k), 李智, Mar 14, 2013, 7:21 PM