Development Steps

The code below is a simplified demo version, not the complete implementation.

Parse text files into vocabulary vectors
Set-of-words model and bag-of-words model: If we only record whether each word appears in a document, treating the presence of a word as a feature, the representation is called a set-of-words model. If a word appears more than once in a document, the repetition may carry information that mere presence or absence cannot express. Counting every occurrence of each word is called a bag-of-words model.
The code below implements the bag-of-words model: each time a word occurs in a document, the counter for that word is incremented by one.

def createVocabList(dataSet):
    """Collect every distinct token that occurs in any document.

    :param dataSet: iterable of documents, each an iterable of hashable tokens.
    :return: list of the unique tokens; the order follows set iteration and
        is therefore not meaningful.
    """
    # Union all documents at once; an empty dataSet yields an empty set.
    uniqueTokens = set().union(*dataSet)
    return list(uniqueTokens)

def bagOfWords2VecMN(vocaList, inputSet):
    """Convert a document into a bag-of-words count vector.

    :param vocaList: list of vocabulary words; position i of the result
        counts occurrences of vocaList[i]. Words must be hashable.
    :param inputSet: iterable of the words of one document (repeats allowed).
    :return: list of ints, same length as vocaList, with the number of times
        each vocabulary word occurs in inputSet. Words that are not in the
        vocabulary are ignored.
    """
    # Build a word -> position table once instead of scanning the list for
    # every word. setdefault keeps the FIRST position of a repeated word,
    # which is what list.index() would return.
    positionOf = {}
    for position, word in enumerate(vocaList):
        positionOf.setdefault(word, position)

    returnVec = [0] * len(vocaList)
    for word in inputSet:
        position = positionOf.get(word)
        if position is not None:
            returnVec[position] += 1    # one more occurrence of this word
    return returnVec

def textParse(bigString):
    """Split raw text into lower-cased tokens longer than two characters.

    :param bigString: the text to tokenize.
    :return: list of lower-cased tokens, in order of appearance, keeping only
        those with more than two characters.
    """
    import re

    # Split on runs of one or more non-word characters. The pattern must be
    # '\W+', not '\W*': since Python 3.7 re.split also splits on empty
    # matches, so '\W*' breaks the text into single characters and every
    # token is then discarded by the length filter.
    listOfTokens = re.split(r'\W+', bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]

def trainNB0(trainMatrix, trainCategory):
    """Estimate naive-Bayes parameters from word-count vectors.

    :param trainMatrix: sequence of documents, each a numeric vector of word
        counts; all vectors must have the same length.
    :param trainCategory: sequence of class labels, one per document; label 1
        is counted as class 1, any other value as class 0.
    :return: tuple ``(p0Vect, p1Vect, pAbusive)`` where ``p0Vect`` and
        ``p1Vect`` are arrays of log word probabilities for class 0 and
        class 1, and ``pAbusive`` is the sum of the labels divided by the
        number of documents (the class-1 fraction for 0/1 labels).
    """
    import numpy as np

    # Total number of documents and number of vocabulary words.
    numTrainDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])

    # Prior probability of class 1.
    pAbusive = np.sum(trainCategory) / float(numTrainDocs)

    # Per-word occurrence counts start at 1 and the totals at 2.0 so that a
    # word never seen in a class does not produce a zero probability (and
    # therefore log(0)).
    p0Num = np.ones(numWords)
    p1Num = np.ones(numWords)
    p0Denom = 2.0
    p1Denom = 2.0

    for i, wordCounts in enumerate(trainMatrix):
        if trainCategory[i] == 1:
            p1Num += wordCounts
            p1Denom += np.sum(wordCounts)
        else:
            p0Num += wordCounts
            p0Denom += np.sum(wordCounts)

    # Use logarithms so that multiplying many small probabilities at
    # classification time becomes a sum and does not underflow.
    p1Vect = np.log(p1Num / p1Denom)    # class 1
    p0Vect = np.log(p0Num / p0Denom)    # class 0

    return p0Vect, p1Vect, pAbusive

Observe the error rate to make sure the classifier works. The text-splitting (tokenizing) routine can be modified to reduce the error rate and improve the classification results.

def calcMostFreq(vocabList, fullText):
    """Return the 30 most frequent vocabulary words with their counts.

    :param vocabList: iterable of vocabulary tokens (hashable).
    :param fullText: sequence of tokens making up the whole corpus, e.g. the
        concatenated word lists of all documents.
    :return: list of at most 30 ``(token, count)`` tuples sorted by count,
        highest first. Tokens that never occur get a count of 0.
    """
    import operator
    from collections import Counter

    # Count the corpus once instead of rescanning it for every vocabulary
    # word.
    occurrences = Counter(fullText)
    freqDict = {token: occurrences[token] for token in vocabList}

    # items() works on both Python 2 and 3; iteritems() is Python 2 only.
    sortedFreq = sorted(freqDict.items(), key=operator.itemgetter(1), reverse=True)
    return sortedFreq[:30]   # keep only the top 30 words

def localWords(feed1, feed0, numTest=20):
    """Train and evaluate a naive-Bayes classifier on two parsed RSS feeds.

    The two feeds are truncated to the same number of entries, the 30 most
    frequent words are dropped from the vocabulary, ``numTest`` randomly
    chosen documents are held out for testing, and the classifier is trained
    on the rest. The hold-out error rate is printed.

    :param feed1: parsed feed whose ``['entries'][i]['summary']`` texts are
        labelled class 1.
    :param feed0: parsed feed with the same layout, labelled class 0.
    :param numTest: number of documents held out for testing (default 20).
    :return: tuple ``(vocabList, p0V, p1V)``: the reduced vocabulary and the
        log word-probability vectors for class 0 and class 1.
    :raises ValueError: if ``numTest`` is not at least 1 and smaller than the
        total number of documents (at least one document must remain for
        training).
    """
    import random

    import numpy as np

    docList = []
    classList = []
    fullText = []
    minLen = min(len(feed1['entries']), len(feed0['entries']))
    for i in range(minLen):
        # Alternate the two feeds so both classes are equally represented.
        for feed, label in ((feed1, 1), (feed0, 0)):
            wordList = textParse(feed['entries'][i]['summary'])
            docList.append(wordList)
            fullText.extend(wordList)
            classList.append(label)

    numDocs = len(docList)
    if not 1 <= numTest < numDocs:
        raise ValueError(
            'numTest must be between 1 and %d (number of documents - 1), got %r'
            % (numDocs - 1, numTest))

    vocabList = createVocabList(docList)

    # Remove the words that have the highest number of occurrences.
    frequentWords = set(pairW[0] for pairW in calcMostFreq(vocabList, fullText))
    vocabList = [word for word in vocabList if word not in frequentWords]

    # Hold out numTest distinct documents chosen at random; the remaining
    # indices, in ascending order, form the training set.
    # NOTE: this uses the standard-library random module, so seed it with
    # random.seed() for reproducible splits.
    testSet = random.sample(range(numDocs), numTest)
    heldOut = set(testSet)
    trainingSet = [docIndex for docIndex in range(numDocs) if docIndex not in heldOut]

    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(bagOfWords2VecMN(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])

    p0V, p1V, pSpam = trainNB0(np.array(trainMat), np.array(trainClasses))

    errorCount = 0
    for docIndex in testSet:
        wordVector = bagOfWords2VecMN(vocabList, docList[docIndex])
        if classifyNB(np.array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1

    # A single formatted string prints identically on Python 2 and 3.
    print('the error rate is: %s' % (float(errorCount) / len(testSet)))

    return vocabList, p0V, p1V

def classifyNB(vec2Classify,p0Vec,p1Vec,pClass1):
    """Pick the more likely class for a word-count vector.

    :param vec2Classify: array of word counts for the document to classify.
    :param p0Vec: array of per-word log probabilities for class 0.
    :param p1Vec: array of per-word log probabilities for class 1.
    :param pClass1: prior probability of class 1.
    :return: 1 if the class-1 log score is strictly greater than the class-0
        log score, otherwise 0 (ties go to class 0).
    """
    # Log score of each class: weighted sum of log word probabilities plus
    # the log prior of that class.
    scoreClass0 = sum(vec2Classify * p0Vec) + log(1.0 - pClass1)
    scoreClass1 = sum(vec2Classify * p1Vec) + log(pClass1)
    return 1 if scoreClass1 > scoreClass0 else 0

def getTopWords(ny,sf):
    """Train on two feeds and print each class's most probable words.

    Calls ``localWords(ny, sf)``, so ``ny`` is treated as class 1 and ``sf``
    as class 0. For each class, the vocabulary words whose log probability
    exceeds -6.0 are printed as a list, most probable first, under an
    ``SF``/``NY`` banner line.

    :param ny: parsed feed for class 1 (passed to localWords as feed1).
    :param sf: parsed feed for class 0 (passed to localWords as feed0).
    :return: None; the results are printed.
    """
    # The training call was commented out, which left vocabList, p0V and
    # p1V undefined and the two parameters unused.
    vocabList,p0V,p1V=localWords(ny,sf)

    # Keep only words whose log probability is above the -6.0 cut-off.
    topSF=[(word,logProb) for word,logProb in zip(vocabList,p0V) if logProb>-6.0]
    topNY=[(word,logProb) for word,logProb in zip(vocabList,p1V) if logProb>-6.0]

    sortedSF=sorted(topSF,key=lambda pair:pair[1],reverse=True)
    print ("SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**")
    S=[item[0] for item in sortedSF]
    print (S)

    sortedNY=sorted(topNY,key=lambda pair:pair[1],reverse=True)
    print ("NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**")
    N=[item[0] for item in sortedNY]
    print (N)

Google Sites

Report abuse