import res = "The quick brown fox jumps over the lazy dog"words = re.split(r"\s+",s) # \s denotes a white space character class# alternatively, words = s.split()print("Segmentation: ")print(words)Segmentation:
['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']
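Splitting on whitespace leaves punctuation glued to its word, which the sentence above happens not to show. A minimal sketch of the difference on a hypothetical sentence with punctuation (not part of the original example):

# Sketch: whitespace splitting vs. a simple regex tokenizer.
import re

t = "The quick brown fox, it seems, jumps over the lazy dog."
print(t.split())              # punctuation stays attached: 'fox,', 'dog.'
print(re.findall(r"\w+", t))  # \w+ keeps only word characters: 'fox', 'dog'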
from nltk.stem.porter import *

stemmer = PorterStemmer()
plurals = ['caresses', 'flies', 'dies', 'mules', 'denied', 'died', 'agreed',
           'owned', 'humbled', 'sized', 'meeting', 'stating', 'siezing',
           'itemization', 'sensational', 'traditional', 'reference',
           'colonizer', 'plotted']
singles = [stemmer.stem(plural) for plural in plurals]
print("Stemming: ")
print(' '.join(singles))

Stemming:
caress fli die mule deni die agre own humbl size meet state siez item sensat tradit refer colon plot
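Stemming is a suffix-chopping heuristic, which is why it produces non-words such as 'fli' and 'deni'. A lemmatizer maps words to dictionary forms instead; a minimal sketch with NLTK's WordNetLemmatizer, assuming the 'wordnet' corpus is downloaded:

# Sketch (assumption: nltk and the 'wordnet' corpus are available).
import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
# pos='v' treats each word as a verb and returns its dictionary form.
print([lemmatizer.lemmatize(w, pos='v') for w in ['flies', 'denied', 'meeting']])
# expected: ['fly', 'deny', 'meet']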
stopwords = {"and", "is", "will", "am", "are", "be", "a", "the"}
proverb = ("All work and no play makes Jack a dull boy, "
           "All play and no work makes Jack a mere toy.")
new_sentence = " ".join(filter(lambda word: word not in stopwords,
                               proverb.lower().split()))
print("New Sentence: ")
print(new_sentence)

New Sentence:
all work no play makes jack dull boy, all play no work makes jack mere toy.
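Hand-written stop lists are easy to outgrow. As a sketch, the same filter with NLTK's English stop list (assuming the 'stopwords' corpus is downloaded):

# Sketch (assumption: nltk and the 'stopwords' corpus are available).
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords as nltk_stopwords

stop_set = set(nltk_stopwords.words('english'))
proverb = "All work and no play makes Jack a dull boy."
print(" ".join(w for w in proverb.lower().split() if w not in stop_set))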
docA = "The cat sat on my face"docB = "The dog sat on my bed"bowA = docA.split(" ")bowB = docB.split(" ")print(bowA)print(bowB)['The', 'cat', 'sat', 'on', 'my', 'face']
['The', 'dog', 'sat', 'on', 'my', 'bed']
wordSet = set(bowA).union(set(bowB))  # union returns all items from both sets, without duplicates
print(wordSet)

{'on', 'face', 'The', 'my', 'dog', 'sat', 'bed', 'cat'}
wordDictA = dict.fromkeys(wordSet, 0)
wordDictB = dict.fromkeys(wordSet, 0)
print(wordDictA)
print(wordDictB)

{'dog': 0, 'cat': 0, 'The': 0, 'on': 0, 'face': 0, 'bed': 0, 'my': 0, 'sat': 0}
{'dog': 0, 'cat': 0, 'The': 0, 'on': 0, 'face': 0, 'bed': 0, 'my': 0, 'sat': 0}
for word in bowA:
    wordDictA[word] += 1
for word in bowB:
    wordDictB[word] += 1
print(wordDictA)
print(wordDictB)

{'on': 1, 'bed': 0, 'my': 1, 'dog': 0, 'cat': 1, 'face': 1, 'The': 1, 'sat': 1}
{'on': 1, 'bed': 1, 'my': 1, 'dog': 1, 'cat': 0, 'face': 0, 'The': 1, 'sat': 1}
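The zero-initialised dict plus counting loop can be collapsed with the standard library's collections.Counter, which returns 0 for missing keys. A minimal equivalent sketch:

# Sketch: the same word counts via collections.Counter.
from collections import Counter

countsA = Counter(bowA)                        # counts only words present in docA
wordDictA2 = {w: countsA[w] for w in wordSet}  # Counter yields 0 for absent words
print(wordDictA2)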
import pandas as pd

print(pd.DataFrame([wordDictA, wordDictB]))

   cat  dog  my  sat  face  The  bed  on
0    1    0   1    1     1    1    0   1
1    0    1   1    1     0    1    1   1
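For anything beyond a toy example, scikit-learn builds this document-term matrix directly. A sketch for comparison (assumption: scikit-learn is installed; note it lowercases by default, so 'The' becomes 'the' and the columns differ slightly from the manual matrix):

# Sketch (assumption: scikit-learn is installed).
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

vectorizer = CountVectorizer()  # lowercases and tokenizes by default
counts = vectorizer.fit_transform([docA, docB])
print(pd.DataFrame(counts.toarray(), columns=vectorizer.get_feature_names_out()))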
# TF(t) = (number of times term t appears in a document) / (total number of terms in the document)
def computeTF(wordDict, bow):
    tfDict = {}
    bowCount = len(bow)
    for word, count in wordDict.items():
        tfDict[word] = count / float(bowCount)
    return tfDict

tfBowA = computeTF(wordDictA, bowA)
tfBowB = computeTF(wordDictB, bowB)
print(tfBowA)
print(tfBowB)

{'dog': 0.0, 'on': 0.16666666666666666, 'face': 0.16666666666666666, 'cat': 0.16666666666666666, 'sat': 0.16666666666666666, 'bed': 0.0, 'The': 0.16666666666666666, 'my': 0.16666666666666666}
{'dog': 0.16666666666666666, 'on': 0.16666666666666666, 'face': 0.0, 'cat': 0.0, 'sat': 0.16666666666666666, 'bed': 0.16666666666666666, 'The': 0.16666666666666666, 'my': 0.16666666666666666}
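To see where 0.1667 comes from: docA has 6 tokens and 'cat' appears once, so TF('cat', docA) = 1/6 ≈ 0.1667, while words absent from the document get 0/6 = 0. A quick check:

# Worked check: 'cat' occurs once among the 6 tokens of docA.
print(bowA.count('cat') / len(bowA))   # 1/6 = 0.1666...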
# IDF(t) = log((number of documents) / (number of documents containing term t))
def computeIDF(docList):
    import math
    N = len(docList)
    print(docList[0])
    print(docList[1])
    idfDict = dict.fromkeys(docList[0].keys(), 0)
    print("Empty Dict")
    print(idfDict)
    # count, for each word, how many documents it appears in
    for doc in docList:
        for word, val in doc.items():
            if val > 0:
                idfDict[word] += 1
    print("\nAfter merging")
    print(idfDict)
    for word, val in idfDict.items():
        idfDict[word] = math.log10(N / float(val))
    return idfDict

idfs = computeIDF([wordDictA, wordDictB])
print("\nIDF")
print(idfs)

{'bed': 0, 'face': 1, 'dog': 0, 'cat': 1, 'The': 1, 'sat': 1, 'on': 1, 'my': 1}
{'bed': 1, 'face': 0, 'dog': 1, 'cat': 0, 'The': 1, 'sat': 1, 'on': 1, 'my': 1}
Empty Dict
{'bed': 0, 'face': 0, 'dog': 0, 'cat': 0, 'The': 0, 'sat': 0, 'on': 0, 'my': 0}
After merging
{'bed': 1, 'face': 1, 'dog': 1, 'cat': 1, 'The': 2, 'sat': 2, 'on': 2, 'my': 2}
IDF
{'bed': 0.3010299956639812, 'face': 0.3010299956639812, 'dog': 0.3010299956639812, 'cat': 0.3010299956639812, 'The': 0.0, 'sat': 0.0, 'on': 0.0, 'my': 0.0}
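The numbers check out: 'cat' appears in 1 of the 2 documents, so IDF('cat') = log10(2/1) ≈ 0.3010, while 'The' appears in both, giving log10(2/2) = 0. A quick check:

# Worked check of the two IDF cases above.
import math
print(math.log10(2 / 1))   # word in 1 of 2 docs -> 0.3010...
print(math.log10(2 / 2))   # word in both docs   -> 0.0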
def computeTFIDF(tfBow, idfs):
    tfidf = {}
    for word, val in tfBow.items():
        tfidf[word] = val * idfs[word]
    return tfidf

tfidfBowA = computeTFIDF(tfBowA, idfs)
tfidfBowB = computeTFIDF(tfBowB, idfs)
print(tfidfBowA)
print(tfidfBowB)

{'The': 0.0, 'cat': 0.050171665943996864, 'sat': 0.0, 'bed': 0.0, 'dog': 0.0, 'on': 0.0, 'face': 0.050171665943996864, 'my': 0.0}
{'The': 0.0, 'cat': 0.0, 'sat': 0.0, 'bed': 0.050171665943996864, 'dog': 0.050171665943996864, 'on': 0.0, 'face': 0.0, 'my': 0.0}
import pandas as pd

print(pd.DataFrame([tfidfBowA, tfidfBowB]))

   sat  The       bed       dog   my   on      face       cat
0  0.0  0.0  0.000000  0.000000  0.0  0.0  0.050172  0.050172
1  0.0  0.0  0.050172  0.050172  0.0  0.0  0.000000  0.000000
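scikit-learn packages the whole TF-IDF pipeline; a sketch for comparison (assumption: scikit-learn is installed). Its defaults differ from the hand-rolled version above, so the absolute numbers will not match, though the same words ('cat', 'face', 'dog', 'bed') stand out:

# Sketch (assumption: scikit-learn is installed). The default TfidfVectorizer
# uses a smoothed natural-log IDF and L2 row normalisation, unlike the
# log10 formula above, so the values differ but the pattern is the same.
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

tfidf = TfidfVectorizer()
scores = tfidf.fit_transform([docA, docB])
print(pd.DataFrame(scores.toarray(), columns=tfidf.get_feature_names_out()))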
# Example 3-grams: "apple smart phone", "android smart phone"

def ngram(s, n):
    words = s.lower().split()
    output = []
    for i in range(0, len(words) - n + 1):
        output.append(words[i:(i + n)])
    return list(map(lambda words: " ".join(words), output))

print(ngram("A class is a blueprint for the object.", 2))

['a class', 'class is', 'is a', 'a blueprint', 'blueprint for', 'for the', 'the object.']
print(ngram("A class is a blueprint for the object.", 3))

['a class is', 'class is a', 'is a blueprint', 'a blueprint for', 'blueprint for the', 'for the object.']
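The index-slicing loop above can also be written with zip, a common Python idiom for sliding windows. An equivalent sketch:

# Sketch: the same word n-grams via zip over shifted copies of the token list.
def ngram_zip(s, n):
    words = s.lower().split()
    return [" ".join(t) for t in zip(*(words[i:] for i in range(n)))]

print(ngram_zip("A class is a blueprint for the object.", 2))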
import nltk
nltk.download('punkt')
# nltk.download()
from nltk.util import ngrams

# Function to generate n-grams from a sentence.
def extract_ngrams(data, num):
    n_grams = ngrams(nltk.word_tokenize(data), num)
    return [' '.join(grams) for grams in n_grams]

data = 'A class is a blueprint for the object.'
print("1-gram: ", extract_ngrams(data, 1))
print("2-gram: ", extract_ngrams(data, 2))
print("3-gram: ", extract_ngrams(data, 3))
print("4-gram: ", extract_ngrams(data, 4))

1-gram:  ['A', 'class', 'is', 'a', 'blueprint', 'for', 'the', 'object', '.']
2-gram: ['A class', 'class is', 'is a', 'a blueprint', 'blueprint for', 'for the', 'the object', 'object .']
3-gram: ['A class is', 'class is a', 'is a blueprint', 'a blueprint for', 'blueprint for the', 'for the object', 'the object .']
4-gram: ['A class is a', 'class is a blueprint', 'is a blueprint for', 'a blueprint for the', 'blueprint for the object', 'for the object .']
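Note that, unlike the manual splitter, nltk.word_tokenize separates the final '.' into its own token. Once n-grams are extracted, counting their frequencies is a common next step; a minimal sketch with collections.Counter, reusing extract_ngrams from above:

# Sketch: counting bigram frequencies (reuses extract_ngrams defined above).
from collections import Counter

bigrams = extract_ngrams("to be or not to be", 2)
print(Counter(bigrams).most_common(3))   # 'to be' occurs twice in this phrase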