This demo uses RSS feeds from the New York Post and the Los Angeles Times. In the training step, after removing repeated words and stop words, we use the 30 most frequent words as the effective words.
ny = feedparser.parse('https://nypost.com/living/feed/')sf = feedparser.parse('https://www.latimes.com/food/rss2.0.xml')runfile('C:/Users/Wruoc/Documents/test1.py', wdir='C:/Users/Wruoc/Documents')[('food', 9), ('year', 8), ('t', 8), ('time', 7), ('long', 6), ('years', 6), ('day', 5), ('island', 4), ('kitchen', 4), ('finally', 4), ('dinner', 4), ('shop', 4), ('lot', 4), ('folks', 4), ('cookbook', 4), ('night', 4), ('open', 3), ('thought', 3), ('decades', 3), ('mets', 3), ('culinary', 3), ('chef', 3), ('milk', 3), ('park', 3), ('days', 3), ('ago', 3), ('festival', 3), ('story', 3), ('charlie', 3), ('soup', 3)]minLen=min(len(feed1['entries']),len(feed0['entries']))print(minLen)runfile('C:/Users/Wruoc/Documents/test1.py', wdir='C:/Users/Wruoc/Documents')20
Because we can only get 20 entries from each RSS feed, we use 80% of them (16 entries) as the training set and 20% (4 entries) as the test set.
for i in range(4): randIndex=int(random.uniform(0,len(trainingSet))) #print('sss',randIndex) testSet.append(trainingSet[randIndex]) #print(testSet) del(trainingSet[randIndex]) #print(trainingSet)trainMat = []trainClasses = []for docIndex in trainingSet: #print('docIndex',docIndex) trainMat.append(bagOfWords2VecMN(vocabList,docList[docIndex])) trainClasses.append(classList[docIndex])p0V,p1V,pSpam=trainNBO(np.array(trainMat),np.array(trainClasses))errorCount=0for docIndex in testSet: wordVector=bagOfWords2VecMN(vocabList,docList[docIndex]) if classifyNB(np.array(wordVector),p0V,p1V,pSpam)!=classList[docIndex]: errorCount+=1print('the error rate is:',float(errorCount)/len(testSet))
the error rate is: 0.2
From the test set, we can get the lists of words associated with Los Angeles and New York.
getTopWords(ny,la)LA**LA**LA**LA**LA**LA**LA**LA**LA**LA**LA**LA**LA**LA['food', 'long', 'dinner', 'shop', 'folks', 'cookbook', 'open', 'island', 'kitchen', 'culinary', 'chef', 'park', 'soup', 'hot', 'bar', 'lot', 'martha', 'years', 'baking', 'bowl', 'party', 'gold', 'small', 'chocolate', 'stewart', 'angeles', 'restaurant', 'recipes', 'http', 'href', 'sugar', 'retail', 'jars', 'potluck', 'target', 'day', 'www', 'menu', 'latest', 'milk', '_blank', 'days', 'ago', 'year', 'hip', 'cakes', 'finally', 'bread', 'los', 'vh', 'cooking', 'tran', 'chicken', 'time', 'classic', 'snoop', 'dessert', 'minded', 'good', 'bowls', 'night', 'july', 'equal', 'desserts', 'foraging', 'dinners', 'surfas', 'thought', 'cookies', 'theater', 'lens', 'gathers', 'slid', 'meal', 'matzo', 'bourdain', 'decades', 'pilgrimage', 'location', 'turning', 'burners', 'cabins', 'ma', 'sardonic', 'friendly', 'sheets', 'reopen', 'wooden', 'single', 'senior', 'vacations', 'snoops', 'beam', 'called', 'hippo', 'la', 'salad', 'diner', 'local', 'trucks', 'beach', 'obama', 'won', 'bookshelves', 'discs', 'school', 'gap', 'visited', 'kid', 'host', 'perry', 'helpful', 'raised', 'temperature', 'advisor', 'library', 'flu', 'beat', 'comfort', 'rapper', 'northern', 'collaboration', 'internet', 'people', 'loaded', 'watch', 'lit', 'fun', 'holidays', 'canadian', 'work', 'surprise', 'talking', 'baseball', 'tre', 'neighborhood', 'dusted', 'shops', 'shortly', 'restaurants', 'spend', 'bearer', 'homey', 'irvine', 'waiting', 'culver', 'nominated', 'center', 'juans', 'favorite', 'tortillas', 'triple', 'march', 'happened', 'tells', 'opened', 'grand', 'nestled', 'anthony', 'pancakes', 'woks', 'seattle', 'dish', 'yotam', 'box', 'brightly', 'saturday', 'gear', 'spin', 'commentator', 'ottolenghi', 'process', 'auctioneer', 'making', 'couple', 'loading', 'heritage', 'dogg', 'narrow', 'fo', 'ball', 'path', 'tiantian', 'backyard', 'lummi', 'hosting', 'story', 'barbecues', 'american', 'lakshmi', 'learned', 'market', 'diep', 'enormous', 
'harris', 'waited', 'knee', 'geeta', 'batch', 'lives', 'case', 'pastries', 'instructor', 'fill', 'grandmother', 'cookbooks', 'games', 'sizes', 'emcee', 'director', 'spice', 'recipe', 'enclave', 'latimes', 'kitchens', 'standard', 'clay', 'girl', 'production', 'stove', 'tasting', 't', 'owners', 'reason', 'adam', 'confess', 'idea', 'seasonal', 'willows', 'book', 'entertaining', 'frosting', 'taquerias', 'stay', 'intellect', 'anytime', 'drinks', 'pasadena', 'creams', 'serving', 'inn', 'fruit', 'qiu', 'size', 'shone', 'coffee', 'navigates', 'mustard', 'dough', 'plumbing', 'review', 'jonathan', 'border', 'moving', 'core', 'summer', 'frosted', 'store', 'circles', 'billy', 'nutrition', 'tangle', 'vividly', 'pita', 'cold', 'remember', 'cure', 'ills', 've', 'revealed', 'distribution', 'freezer', 'live', 'pastry', 'dinette', 'monterey', 'batches', 'installed', 'friends', 'tea', 'tosi', 'cutting', 'accustomed', 'homestate', 'lever', 'cookie', 'closed', 'correspondent', 'padma', 'kids', 'passageways', 'deftly', 'ice', 'adjusts', 'district', 'owner', 'showcases', 'wonderland', 'site', 'stacked', 'bottles', 'pop', 'impresario', 'classroom', 'fairfax', 'places', 'containers', 'kind', 'letterpress', 'retro', 'indian', 'including', 'beverlywood', 'rolled', 'carts', 'cook', 'highland', 'lab', 'supply', 'vietnamese', 'doors', 'bean', 'oven', 'city', 'things', 'hand', 'whipping', 'lang', 'iteration', 'big', 'rounds', 'writer', 'administration', 'music', 'bansal', 'beans', 'roasters', 'stories', 'pies', 'christina', 'underneath', 'dawn', 'vendors', 'angelenos', 'stacks', 'trodden', 'html', 'cruise', 'born', 'parts', 'policy', 'green', 'lifetimes', 'san', 'frisbees']NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**['year', 't', 'day', 'festival', 'charlie', 'time', 'years', 'dress', 'burger', 'month', 'shafiroff', 'cecile', 'april', 'cathy', 'wait', 'helen', 'patricia', 'finally', 'story', 'husband', 'friday', 'lost', 'book', 'donald', 'winds', 'food', 'carnes', 'mother', 
'campaign', 'trump', 'beef', 'miami', 'objectionable', 'things', 'friendship', 'big', 'night', 'eledge', 'carnivorous', 'curb', 'license', 'eating', 'juiciest', 'video', 'fools', 'king', 'creeps', 'lobby', 'earlier', 'children', 'thought', 'fire', 'nun', 'poet', 'reilly', 'caused', 'logistical', 'hate', 'eateries', 'winced', 'swims', 'decades', 'convertible', 'victim', 'field', 'harvard', 'upstate', 'age', 'distance', 'island', 'struggle', 'ack', 'strip', 'surrogate', 'greg', 'social', 'cheats', 'felon', 'ruling', 'connected', 'sophia', 'purging', 'blew', 'meaning', 'annual', 'meted', 'single', 'unsuspecting', 'fault', 'southeast', 'tidying', 'son', 'public', 'detritus', 'mercury', 'viral', 'hours', 'community', 'jean', 'buses', 'gp', 'posted', 'won', 'monday', 'sons', 'arms', 'nell', 'closer', 'drank', 'james', 'gofundme', 'clutter', 'shared', 'tavern', 'phelps', 'bat', 'landmark', 'sixteen', 'toll', 'convicted', 'sigh', 'dan', 'stanford', 'west', 'impossible', 'nostalgia', 'assured', 'attendees', 'deepest', 'sadistic', 'gabrielle', 'guisewite', 'writes', 'explanation', 'protection', 'people', 'biting', 'elliot', 'revoked', 'shoulders', 'putnam', 'office', 'tuesday', 'decided', 'hit', 'met', 'fun', 'event', 'virginia', 'cars', 'desires', 'bunch', 'baseball', 'beautifully', 'hardy', 'bottom', 'squared', 'dubbing', 'fiction', 'eats', 'drive', 'nicknamed', 'flashy', 'punisher', 'franchise', 'rick', 'brought', 'season', 'gray', 'mets', 'center', 'souls', 'famous', 'spring', 'clear', 'reconsider', 'sandwich', 'instated', 'watched', 'march', 'dread', 'phone', 'xy', 'deeper', 'creator', 'started', 'unconscious', 'dylan', 'commander', 'fraud', 'dmv', 'fyre', 'lip', 'throw', 'surprised', 'preservationists', 'prison', 'assailants', 'family', 'desktop', 'wanted', 'sister', 'touching', 'darien', 'hudson', 'served', 'grandchildren', 'beloved', 'broiled', 'susan', 'cult', 'length', 'weeks', 'days', 'investment', 'grief', 'picked', 'comic', 'ago', 'portland', 'inmate', 'making', 
'york', 'steven', 'buzzy', 'powerhouse', 'shocking', 'purveyor', 'written', 'grown', 'yorkers', 'kondo', 'extreme', 'village', 'question', 'breathed', 'shut', 'beating', 'relief', 'college', 'sprung', 'beard', 'massachusetts', 'market', 'love', 'communications', 'foundation', 'league', 'purveyors', 'horse', 'fridays', 'hose', 'start', 'john', 'uma', 'blatant', 'chain', 'championships', 'director', 'resources', 'space', 'catherine', 'obstetrician', 'grandpa', 'seeking', 'agency', 'human', 'retired', 'croman', 'hordes', 'hamptons', 'ultra', 'program', 'hamburger', 'lured', 'firm', 'woke', 'nebraska', 'played', 'fifty', 'presidents', 'set', 'week', 'netflix', 'series', 'glam', 'drawers', 'black', 'emmy', 'filing', 'struck', 'granddaughter', 'golf', 'media', 'michael', 'whopper', 'fundraising', 'shelves', 'sold', 'chefs', 'tackles', 'customer', 'xx', 'understand', 'matter', 'moving', 'summer', 'dougherty', 'pizza', 'thomas', 'filled', 'throwing', 'grandmothers', 'vanity', 'matthew', 'aging', 'kick', 'world', 'receiving', 'thighs', 'couldn', 'reports', 'aptmetrics', 'marie', 'camera', 'beaten', 'knopf', 'vegetarian', 'spoke', 'pulled', 'century', 'brutal', 'amp', 'thursday', 'scene', 'st', 'cool', 'didn', 'suit', 'inseparable', 'kids', 'yankees', 'competition', 'thermometer', 'readers', 'customers', 'activist', 'nation', 'ended', 'dies', 'concertgoers', 'birth', 'committee', 'white', 'dear', 'celebration', 'meltdown', 'handle', 'transitions', 'plates', 'crime', 'dressed', 'religious', 'alternative', 'birthday', 'leaders', 'police', 'consulting', 'corporate', 'texts', 'taste', 'meditation', 'city', 'don', 'game', 'trans', 'flame', 'sinking', 'louise', 'version', 'music', 'freudenberger', 'gourmet', 'aggressively', 'shuttle', 'pink', 'casual', 'key', 'joke', 'stiff', 'citi', 'spoil', 'post', 'outfit', 'geographic', 'tax', 'boast', 'holding', 'organizers', 'cabinets', 'essays', 'socialite', 'repeatedly', 'illness', 'walsh', 'car']In this model the project extract the 
information from the RSS feed of "Craigslist". The project uses the "sm biz ads" section of Craigslist for New York and San Francisco. Users can enter job keywords, and the model returns which area shows more interest in that kind of job.
Looking for a room?
from craigslist import CraigslistHousingcl_h = CraigslistHousing(category='roo', filters={'max_price': 1200, 'private_room': True})for result in cl_h.get_results(sort_by='newest', geotagged=True): print result{ 'id': u'4851150747', 'name': u'Near SFSU, UCSF and NEWLY FURNISHED - CLEAN, CONVENIENT and CLEAN!', 'url': u'http://sfbay.craigslist.org/sfc/roo/4851150747.html', 'datetime': u'2015-01-27 23:44', 'price': u'$1100', 'where': u'inner sunset / UCSF', 'has_image': False, 'has_map': True, 'geotag': (37.738473, -122.494721)}# ...Maybe an engineering internship?
from craigslist import CraigslistJobscl_j = CraigslistJobs(category='eng', filters={'is_internship': True, 'employment_type': ['full-time', 'part-time']})for result in cl_j.get_results(): print result{ 'id': u'5708651182', 'name': u'GAME DEVELOPER INTERNSHIP AT TYNKER - AVAILABLE NOW!', 'url': u'http://sfbay.craigslist.org/pen/eng/5708651182.html', 'datetime': u'2016-07-30 13:30', 'price': None, 'where': u'mountain view', 'has_image': True, 'has_map': True, 'geotag': None}# ...Events with free food?
from craigslist import CraigslistEventscl_e = CraigslistEvents( filters={'free': True, 'food': True})for result in cl_e.get_results(sort_by='newest', limit=5): print result{ 'id': u'4866178242', 'name': u'Lituation Thursdays @ Le Reve', 'url': u'http://newyork.craigslist.org/mnh/eve/4866178242.html', 'datetime': u'1/29', 'price': None, 'where': u'Midtown East', 'has_image': True, 'has_map': True, 'geotag': None}# ...