def calcMostFreq(vocabList,fullText):
import operator
freqDict={}
for token in vocabList: #Traverse each word in the vocabulary
freqDict[token]=fullText.count(token) #Count the number of times each word appears in the text
sortedFreq=sorted(freqDict.iteritems(),key=operator.itemgetter(1),reverse=True) #sort dictionary
return sortedFreq[:30] #Return the top 30 words
def localWords(feed1,feed0):
import feedparser
docList=[];classList=[];fullText=[]
minLen=min(len(feed1['entries']),len(feed0['entries']))
for i in range(minLen):
wordList=textParse(feed1['entries'][i]['summary']) #Access each RSS feed
docList.append(wordList)
fullText.extend(wordList)
classList.append(1)
wordList=textParse(feed0['entries'][i]['summary'])
docList.append(wordList)
fullText.extend(wordList)
classList.append(0)
vocabList=createVocabList(docList)
top30Words=calcMostFreq(vocabList,fullText)
for pairW in top30Words:
if pairW[0] in vocabList:vocabList.remove(pairW[0]) #Remove words that have the highest number of occurrences
trainingSet=range(2*minLen);testSet=[]
for i in range(20):
randIndex=int(random.uniform(0,len(trainingSet)))
testSet.append(trainingSet[randIndex])
del(trainingSet[randIndex])
trainMat=[];trainClasses=[]
for docIndex in trainingSet:
trainMat.append(bagOfWords2VecMN(vocabList,docList[docIndex]))
trainClasses.append(classList[docIndex])
p0V,p1V,pSpam=trainNBO(array(trainMat),array(trainClasses))
errorCount=0
for docIndex in testSet:
wordVector=bagOfWords2VecMN(vocabList,docList[docIndex])
if classifyNB(array(wordVector),p0V,p1V,pSpam)!=classList[docIndex]:
errorCount+=1
print 'the error rate is:',float(errorCount)/len(testSet)
return vocabList,p0V,p1V
def getTopWords(ny,sf):
#import operator
#vocabList,p0V,p1V=localWords(ny,sf)
topNY=[];topSF=[]
N = []; S = []
for i in range(len(p0V)):
if p0V[i]>-6.0:topSF.append((vocabList[i],p0V[i]))
if p1V[i]>-6.0:topNY.append((vocabList[i],p1V[i]))
sortedSF=sorted(topSF,key=lambda pair:pair[1],reverse=True)
print ("SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**")
for item in sortedSF:
S.append(item[0])
#print (item[0])
print (S)
sortedNY=sorted(topNY,key=lambda pair:pair[1],reverse=True)
print ("NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**")
for item in sortedNY:
N.append(item[0])
#print (item[0])
print (N)