import re

# Word segmentation: break a sentence into tokens on runs of whitespace.
sentence = "The quick brown fox jumps over the lazy dog"
words = re.split(r"\s+", sentence)  # \s is the whitespace character class
# alternatively: words = sentence.split()
print("Segmentation: ")
print(words)
Segmentation:
['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']
from nltk.stem.porter import *

# Stemming: reduce each word to a crude root form with the Porter stemmer.
stemmer = PorterStemmer()
plurals = ['caresses', 'flies', 'dies', 'mules', 'denied',
           'died', 'agreed', 'owned', 'humbled', 'sized',
           'meeting', 'stating', 'siezing', 'itemization',
           'sensational', 'traditional', 'reference',
           'colonizer', 'plotted']
singles = list(map(stemmer.stem, plurals))
print("Stemming: ")
print(' '.join(singles))
Stemming:
caress fli die mule deni die agre own humbl size meet state siez item sensat tradit refer colon plot
# Stop-word removal: drop common function words from the sentence.
stopwords = {"and", "is", "will", "am", "are", "be", "a", "the"}
proverb = "All work and no play makes Jack a dull boy, All play and no work makes Jack a mere toy."
kept_words = [w for w in proverb.lower().split() if w not in stopwords]
new_sentence = " ".join(kept_words)
print("New Sentence: ")
print(new_sentence)
New Sentence:
all work no play makes jack dull boy, all play no work makes jack mere toy.
# Two toy documents, tokenized into bags of words (lists of tokens).
docA = "The cat sat on my face"
docB = "The dog sat on my bed"
bowA, bowB = docA.split(" "), docB.split(" ")
print(bowA)
print(bowB)
['The', 'cat', 'sat', 'on', 'my', 'face']
['The', 'dog', 'sat', 'on', 'my', 'bed']
# Vocabulary: every distinct token appearing in either document
# (set union excludes duplicates).
wordSet = set(bowA) | set(bowB)
print(wordSet)
{'on', 'face', 'The', 'my', 'dog', 'sat', 'bed', 'cat'}
# Zero-initialized term-count dicts, one per document, keyed by the vocabulary.
wordDictA = {w: 0 for w in wordSet}
wordDictB = {w: 0 for w in wordSet}
print(wordDictA)
print(wordDictB)
{'dog': 0, 'cat': 0, 'The': 0, 'on': 0, 'face': 0, 'bed': 0, 'my': 0, 'sat': 0}
{'dog': 0, 'cat': 0, 'The': 0, 'on': 0, 'face': 0, 'bed': 0, 'my': 0, 'sat': 0}
# Count raw term frequencies for each document.
for token in bowA:
    wordDictA[token] += 1
for token in bowB:
    wordDictB[token] += 1
print(wordDictA)
print(wordDictB)
{'on': 1, 'bed': 0, 'my': 1, 'dog': 0, 'cat': 1, 'face': 1, 'The': 1, 'sat': 1}
{'on': 1, 'bed': 1, 'my': 1, 'dog': 1, 'cat': 0, 'face': 0, 'The': 1, 'sat': 1}
import pandas as pd

# Show both count vectors side by side: one row per document.
print(pd.DataFrame([wordDictA, wordDictB]))
cat dog my sat face The bed on
0 1 0 1 1 1 1 0 1
1 0 1 1 1 0 1 1 1
def computeTF(wordDict, bow):
    """Compute the term frequency (TF) of every word in one document.

    TF(t) = (number of times term t appears in the document)
            / (total number of terms in the document).

    Args:
        wordDict: mapping of word -> raw count for this document.
        bow: the document's token list; only its length is used.

    Returns:
        dict mapping each word in ``wordDict`` to its TF as a float.
        For an empty document every TF is 0.0 (instead of raising
        ZeroDivisionError as a plain division would).
    """
    total = len(bow)
    if total == 0:  # guard: an empty document has no terms
        return dict.fromkeys(wordDict, 0.0)
    return {word: count / total for word, count in wordDict.items()}
# TF vectors for the two example documents.
tfBowA = computeTF(wordDictA, bowA)
tfBowB = computeTF(wordDictB, bowB)
for tf_vector in (tfBowA, tfBowB):
    print(tf_vector)
{'dog': 0.0, 'on': 0.16666666666666666, 'face': 0.16666666666666666, 'cat': 0.16666666666666666, 'sat': 0.16666666666666666, 'bed': 0.0, 'The': 0.16666666666666666, 'my': 0.16666666666666666}
{'dog': 0.16666666666666666, 'on': 0.16666666666666666, 'face': 0.0, 'cat': 0.0, 'sat': 0.16666666666666666, 'bed': 0.16666666666666666, 'The': 0.16666666666666666, 'my': 0.16666666666666666}
# IDF(t) = log10(number of documents / number of documents containing term t)
def computeIDF(docList):
    """Compute the inverse document frequency of every word in a corpus.

    Args:
        docList: list of per-document count dicts (word -> raw count).
            All dicts are assumed to share the same vocabulary keys
            (they are built from the same wordSet above).

    Returns:
        dict mapping each word to log10(N / df), where N is the number
        of documents and df is the number of documents whose count for
        the word is > 0. A word that appears in no document gets 0.0
        (instead of dividing by zero). An empty corpus yields {}.
    """
    import math

    if not docList:  # guard: no documents means no vocabulary
        return {}
    N = len(docList)
    # Document frequency: in how many documents does each word occur?
    df = dict.fromkeys(docList[0], 0)
    for doc in docList:
        for word, count in doc.items():
            if count > 0:
                df[word] += 1
    return {word: math.log10(N / freq) if freq else 0.0
            for word, freq in df.items()}
# Corpus-level IDF weights computed from both documents' count dicts.
idfs = computeIDF([wordDictA, wordDictB])
print("\nIDF")
print(idfs)
{'bed': 0, 'face': 1, 'dog': 0, 'cat': 1, 'The': 1, 'sat': 1, 'on': 1, 'my': 1}
{'bed': 1, 'face': 0, 'dog': 1, 'cat': 0, 'The': 1, 'sat': 1, 'on': 1, 'my': 1}
Empty Dict
{'bed': 0, 'face': 0, 'dog': 0, 'cat': 0, 'The': 0, 'sat': 0, 'on': 0, 'my': 0}
After merging
{'bed': 1, 'face': 1, 'dog': 1, 'cat': 1, 'The': 2, 'sat': 2, 'on': 2, 'my': 2}
IDF
{'bed': 0.3010299956639812, 'face': 0.3010299956639812, 'dog': 0.3010299956639812, 'cat': 0.3010299956639812, 'The': 0.0, 'sat': 0.0, 'on': 0.0, 'my': 0.0}
def computeTFIDF(tfBow, idfs):
    """Combine TF and IDF into TF-IDF weights for one document.

    Args:
        tfBow: mapping word -> term frequency for the document.
        idfs: mapping word -> inverse document frequency for the corpus.

    Returns:
        dict mapping each word in ``tfBow`` to tf * idf.
    """
    return {word: tf * idfs[word] for word, tf in tfBow.items()}
# TF-IDF vectors for both documents.
tfidfBowA = computeTFIDF(tfBowA, idfs)
tfidfBowB = computeTFIDF(tfBowB, idfs)
for weight_vector in (tfidfBowA, tfidfBowB):
    print(weight_vector)
{'The': 0.0, 'cat': 0.050171665943996864, 'sat': 0.0, 'bed': 0.0, 'dog': 0.0, 'on': 0.0, 'face': 0.050171665943996864, 'my': 0.0}
{'The': 0.0, 'cat': 0.0, 'sat': 0.0, 'bed': 0.050171665943996864, 'dog': 0.050171665943996864, 'on': 0.0, 'face': 0.0, 'my': 0.0}
import pandas as pd

# Display the TF-IDF weights as a two-row table (one row per document).
print(pd.DataFrame([tfidfBowA, tfidfBowB]))
sat The bed dog my on face cat
0 0.0 0.0 0.000000 0.000000 0.0 0.0 0.050172 0.050172
1 0.0 0.0 0.050172 0.050172 0.0 0.0 0.000000 0.000000
apple smart phone
android smart phone
def ngram(s, n):
    """Return the word-level n-grams of sentence ``s`` as joined strings.

    Args:
        s: input sentence; it is lowercased and split on whitespace.
        n: gram size (e.g. 2 for bigrams, 3 for trigrams).

    Returns:
        list of strings, each being n consecutive words joined by a
        single space. Empty when the sentence has fewer than n words.
    """
    words = s.lower().split()
    # One n-gram starts at every position with at least n words remaining.
    # (Single comprehension replaces the original slice-list + map(lambda)
    # whose parameter shadowed `words`.)
    return [" ".join(words[i:i + n]) for i in range(len(words) - n + 1)]
print(ngram("A class is a blueprint for the object.", 2))
['a class', 'class is', 'is a', 'a blueprint', 'blueprint for', 'for the', 'the object.']
print(ngram("A class is a blueprint for the object.", 3))
['a class is', 'class is a', 'is a blueprint', 'a blueprint for', 'blueprint for the', 'for the object.']
import nltk
nltk.download('punkt')
# nltk.download()
from nltk.util import ngrams
# Function to generate n-grams from sentences.
def extract_ngrams(data, num):
    """Return the ``num``-grams of ``data`` using NLTK's word tokenizer."""
    tokens = nltk.word_tokenize(data)
    return [' '.join(gram) for gram in ngrams(tokens, num)]
# Demonstrate 1- through 4-grams on a sample sentence.
data = 'A class is a blueprint for the object.'
for size in range(1, 5):
    print(f"{size}-gram: ", extract_ngrams(data, size))
1-gram: ['A', 'class', 'is', 'a', 'blueprint', 'for', 'the', 'object', '.']
2-gram: ['A class', 'class is', 'is a', 'a blueprint', 'blueprint for', 'for the', 'the object', 'object .']
3-gram: ['A class is', 'class is a', 'is a blueprint', 'a blueprint for', 'blueprint for the', 'for the object', 'the object .']
4-gram: ['A class is a', 'class is a blueprint', 'is a blueprint for', 'a blueprint for the', 'blueprint for the object', 'for the object .']