import re

# Word segmentation: break a sentence into tokens on runs of whitespace.
sentence = "The quick brown fox jumps over the lazy dog"
words = re.split(r"\s+", sentence)  # \s is the whitespace character class
# alternatively: words = sentence.split()
print("Segmentation: ")
print(words)
Segmentation:
['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']
from nltk.stem.porter import *

# Stemming: reduce each word to a crude root form with the Porter stemmer.
stemmer = PorterStemmer()
plurals = ['caresses', 'flies', 'dies', 'mules', 'denied',
           'died', 'agreed', 'owned', 'humbled', 'sized',
           'meeting', 'stating', 'siezing', 'itemization',
           'sensational', 'traditional', 'reference',
           'colonizer', 'plotted']
singles = list(map(stemmer.stem, plurals))
print("Stemming: ")
print(' '.join(singles))
Stemming:
caress fli die mule deni die agre own humbl size meet state siez item sensat tradit refer colon plot
# Stop-word removal: drop common function words from the sentence.
stopwords = {"and", "is", "will", "am", "are", "be", "a", "the"}
proverb = "All work and no play makes Jack a dull boy, All play and no work makes Jack a mere toy."
kept_words = [w for w in proverb.lower().split() if w not in stopwords]
new_sentence = " ".join(kept_words)
print("New Sentence: ")
print(new_sentence)
New Sentence:
all work no play makes jack dull boy, all play no work makes jack mere toy.
# Two toy documents, tokenized into bags of words (lists of tokens).
docA = "The cat sat on my face"
docB = "The dog sat on my bed"
bowA, bowB = docA.split(" "), docB.split(" ")
print(bowA)
print(bowB)
['The', 'cat', 'sat', 'on', 'my', 'face']
['The', 'dog', 'sat', 'on', 'my', 'bed']
# Vocabulary: every distinct token appearing in either document
# (set union excludes duplicates).
wordSet = set(bowA) | set(bowB)
print(wordSet)
{'on', 'face', 'The', 'my', 'dog', 'sat', 'bed', 'cat'}
# Zero-initialized term-count dicts, one per document, keyed by the vocabulary.
wordDictA = {w: 0 for w in wordSet}
wordDictB = {w: 0 for w in wordSet}
print(wordDictA)
print(wordDictB)
{'dog': 0, 'cat': 0, 'The': 0, 'on': 0, 'face': 0, 'bed': 0, 'my': 0, 'sat': 0}
{'dog': 0, 'cat': 0, 'The': 0, 'on': 0, 'face': 0, 'bed': 0, 'my': 0, 'sat': 0}
# Count raw term frequencies for each document.
for token in bowA:
    wordDictA[token] += 1
for token in bowB:
    wordDictB[token] += 1
print(wordDictA)
print(wordDictB)
{'on': 1, 'bed': 0, 'my': 1, 'dog': 0, 'cat': 1, 'face': 1, 'The': 1, 'sat': 1}
{'on': 1, 'bed': 1, 'my': 1, 'dog': 1, 'cat': 0, 'face': 0, 'The': 1, 'sat': 1}
import pandas as pd

# Show both count vectors side by side: one row per document.
print(pd.DataFrame([wordDictA, wordDictB]))
cat dog my sat face The bed on
0 1 0 1 1 1 1 0 1
1 0 1 1 1 0 1 1 1
def computeTF(wordDict, bow):
    """Compute the term frequency (TF) of every word in one document.

    TF(t) = (number of times term t appears in the document)
            / (total number of terms in the document).

    Args:
        wordDict: mapping of word -> raw count for this document.
        bow: the document's token list; only its length is used.

    Returns:
        dict mapping each word in ``wordDict`` to its TF as a float.
        For an empty document every TF is 0.0 (instead of raising
        ZeroDivisionError as a plain division would).
    """
    total = len(bow)
    if total == 0:  # guard: an empty document has no terms
        return dict.fromkeys(wordDict, 0.0)
    return {word: count / total for word, count in wordDict.items()}
# TF vectors for the two example documents.
tfBowA = computeTF(wordDictA, bowA)
tfBowB = computeTF(wordDictB, bowB)
for tf_vector in (tfBowA, tfBowB):
    print(tf_vector)
{'dog': 0.0, 'on': 0.16666666666666666, 'face': 0.16666666666666666, 'cat': 0.16666666666666666, 'sat': 0.16666666666666666, 'bed': 0.0, 'The': 0.16666666666666666, 'my': 0.16666666666666666}
{'dog': 0.16666666666666666, 'on': 0.16666666666666666, 'face': 0.0, 'cat': 0.0, 'sat': 0.16666666666666666, 'bed': 0.16666666666666666, 'The': 0.16666666666666666, 'my': 0.16666666666666666}
# IDF(t) = log10(number of documents / number of documents containing term t)
def computeIDF(docList):
    """Compute the inverse document frequency of every word in a corpus.

    Args:
        docList: list of per-document count dicts (word -> raw count).
            All dicts are assumed to share the same vocabulary keys
            (they are built from the same wordSet above).

    Returns:
        dict mapping each word to log10(N / df), where N is the number
        of documents and df is the number of documents whose count for
        the word is > 0. A word that appears in no document gets 0.0
        (instead of dividing by zero). An empty corpus yields {}.
    """
    import math

    if not docList:  # guard: no documents means no vocabulary
        return {}
    N = len(docList)
    # Document frequency: in how many documents does each word occur?
    df = dict.fromkeys(docList[0], 0)
    for doc in docList:
        for word, count in doc.items():
            if count > 0:
                df[word] += 1
    return {word: math.log10(N / freq) if freq else 0.0
            for word, freq in df.items()}
# Corpus-level IDF weights computed from both documents' count dicts.
idfs = computeIDF([wordDictA, wordDictB])
print("\nIDF")
print(idfs)
{'bed': 0, 'face': 1, 'dog': 0, 'cat': 1, 'The': 1, 'sat': 1, 'on': 1, 'my': 1}
{'bed': 1, 'face': 0, 'dog': 1, 'cat': 0, 'The': 1, 'sat': 1, 'on': 1, 'my': 1}
Empty Dict
{'bed': 0, 'face': 0, 'dog': 0, 'cat': 0, 'The': 0, 'sat': 0, 'on': 0, 'my': 0}
After merging
{'bed': 1, 'face': 1, 'dog': 1, 'cat': 1, 'The': 2, 'sat': 2, 'on': 2, 'my': 2}
IDF
{'bed': 0.3010299956639812, 'face': 0.3010299956639812, 'dog': 0.3010299956639812, 'cat': 0.3010299956639812, 'The': 0.0, 'sat': 0.0, 'on': 0.0, 'my': 0.0}
def computeTFIDF(tfBow, idfs):
    """Combine TF and IDF into TF-IDF weights for one document.

    Args:
        tfBow: mapping word -> term frequency for the document.
        idfs: mapping word -> inverse document frequency for the corpus.

    Returns:
        dict mapping each word in ``tfBow`` to tf * idf.
    """
    return {word: tf * idfs[word] for word, tf in tfBow.items()}
# TF-IDF vectors for both documents.
tfidfBowA = computeTFIDF(tfBowA, idfs)
tfidfBowB = computeTFIDF(tfBowB, idfs)
for weight_vector in (tfidfBowA, tfidfBowB):
    print(weight_vector)
{'The': 0.0, 'cat': 0.050171665943996864, 'sat': 0.0, 'bed': 0.0, 'dog': 0.0, 'on': 0.0, 'face': 0.050171665943996864, 'my': 0.0}
{'The': 0.0, 'cat': 0.0, 'sat': 0.0, 'bed': 0.050171665943996864, 'dog': 0.050171665943996864, 'on': 0.0, 'face': 0.0, 'my': 0.0}
import pandas as pd

# Display the TF-IDF weights as a two-row table (one row per document).
print(pd.DataFrame([tfidfBowA, tfidfBowB]))
sat The bed dog my on face cat
0 0.0 0.0 0.000000 0.000000 0.0 0.0 0.050172 0.050172
1 0.0 0.0 0.050172 0.050172 0.0 0.0 0.000000 0.000000
apple smart phone
android smart phone
def ngram(s, n):
    """Return the word-level n-grams of sentence ``s`` as joined strings.

    Args:
        s: input sentence; it is lowercased and split on whitespace.
        n: gram size (e.g. 2 for bigrams, 3 for trigrams).

    Returns:
        list of strings, each being n consecutive words joined by a
        single space. Empty when the sentence has fewer than n words.
    """
    words = s.lower().split()
    # One n-gram starts at every position with at least n words remaining.
    # (Single comprehension replaces the original slice-list + map(lambda)
    # whose parameter shadowed `words`.)
    return [" ".join(words[i:i + n]) for i in range(len(words) - n + 1)]
print(ngram("A class is a blueprint for the object.", 2))
['a class', 'class is', 'is a', 'a blueprint', 'blueprint for', 'for the', 'the object.']
print(ngram("A class is a blueprint for the object.", 3))
['a class is', 'class is a', 'is a blueprint', 'a blueprint for', 'blueprint for the', 'for the object.']
import nltk
nltk.download('punkt')
# nltk.download()
from nltk.util import ngrams
# Function to generate n-grams from sentences.
def extract_ngrams(data, num):
    """Return the ``num``-grams of ``data`` using NLTK's word tokenizer."""
    tokens = nltk.word_tokenize(data)
    return [' '.join(gram) for gram in ngrams(tokens, num)]
# Demonstrate 1- through 4-grams on a sample sentence.
data = 'A class is a blueprint for the object.'
for size in range(1, 5):
    print(f"{size}-gram: ", extract_ngrams(data, size))
1-gram: ['A', 'class', 'is', 'a', 'blueprint', 'for', 'the', 'object', '.']
2-gram: ['A class', 'class is', 'is a', 'a blueprint', 'blueprint for', 'for the', 'the object', 'object .']
3-gram: ['A class is', 'class is a', 'is a blueprint', 'a blueprint for', 'blueprint for the', 'for the object', 'the object .']
4-gram: ['A class is a', 'class is a blueprint', 'is a blueprint for', 'a blueprint for the', 'blueprint for the object', 'for the object .']