from lxml import etree

# Load the address book document (the XML file must be well-formed).
doc = etree.parse("p5_addressbook.xml")

# Absolute XPath: <contact> elements directly under the root <addressbook>.
results = doc.xpath("/addressbook/contact")
print(results)
# Sample output (the object addresses differ from run to run):
# [<Element contact at 0x1ce580b9b88>, <Element contact at 0x1ce580b9bc8>]
# Print the text of every child element of each <contact>.
results = doc.xpath("/addressbook/contact")
for e in results:
    # Iterating an element yields its direct children; lxml deprecates
    # Element.getchildren() in favour of iteration / list(element).
    print([c.text for c in e])

# Alternative query using the descendant axis (left disabled, as in the
# original):
# results = doc.xpath("//contact")
# for e in results:
#     print([c.text for c in e])
#
# Output:
# ['Kenny', '65501696', 'kenny_lu@nyp.edu.sg']
# ['Charles', '95501551', 'charles_lee@nyp.edu.sg']
# Print one formatted line per <contact> using relative XPath queries.
results = doc.xpath("/addressbook/contact")
for contact in results:
    # Each relative query returns a list of matches; the first match of each
    # is used, so a contact lacking one of the fields raises IndexError.
    fields = tuple(
        contact.xpath(query)[0]
        for query in ("name/text()", "phone/text()", "email/text()")
    )
    print("name = %s, phone = %s, email = %s" % fields)
# Output:
# name = Kenny, phone = 65501696, email = kenny_lu@nyp.edu.sg
# name = Charles, phone = 95501551, email = charles_lee@nyp.edu.sg
# Attribute predicate: keep only <phone> elements whose "type" attribute
# equals "mobile", anywhere below a <contact>.
results = doc.xpath("//contact/phone[@type='mobile']")
for phone in results:
    print(phone.text)
# Output:
# 95501551
from lxml import etree


def menu_xml_to_tsv():
    """Convert p5_menu.xml into the tab-separated file p5_menu.tsv.

    Every /breakfast_menu/food element becomes one output line whose fields
    are the text of its child elements, joined by tab characters.  The list
    of matched elements, each row's field list and each joined line are also
    printed to stdout.
    """
    menu_xml = etree.parse("p5_menu.xml")
    results = menu_xml.xpath("/breakfast_menu/food")
    print(results)
    # The with-block closes the file on exit, so no explicit close() is
    # needed.  An explicit encoding keeps the output independent of the
    # platform's default text encoding.
    with open("p5_menu.tsv", "w", encoding="utf-8") as menu_tsv:
        for food in results:
            # Iterating an element yields its direct children (getchildren()
            # is deprecated in lxml).  A child without text has text None,
            # which str.join rejects, so it is written as an empty field.
            fields = [child.text or "" for child in food]
            print(fields)
            line = "\t".join(fields)
            print(line)
            menu_tsv.write(line + "\n")


menu_xml_to_tsv()
# Sample output (object addresses differ from run to run; the tab separators
# of the joined lines appear as single spaces in this transcript):
# [<Element food at 0x18f84efad88>, <Element food at 0x18f84f03088>, <Element food at 0x18f84f03308>, <Element food at 0x18f84f031c8>, <Element food at 0x18f84f03188>]
# ['Belgian Waffles', '5.95', 'Two of our famous Belgian Waffles with plenty of real maple syrup', '650']
# Belgian Waffles 5.95 Two of our famous Belgian Waffles with plenty of real maple syrup 650
# ['Strawberry Belgian Waffles', '7.95', 'Light Belgian waffles covered with strawberries and whipped cream', '900']
# Strawberry Belgian Waffles 7.95 Light Belgian waffles covered with strawberries and whipped cream 900
# ['Berry-Berry Belgian Waffles', '8.95', 'Light Belgian waffles covered with an assortment of fresh berries and whipped cream', '900']
# Berry-Berry Belgian Waffles 8.95 Light Belgian waffles covered with an assortment of fresh berries and whipped cream 900
# ['French Toast', '4.50', 'Thick slices made from our homemade sourdough bread', '600']
# French Toast 4.50 Thick slices made from our homemade sourdough bread 600
# ['Homestyle Breakfast', '6.95', 'Two eggs, bacon or sausage, toast, and our ever-popular hash browns', '950']
# Homestyle Breakfast 6.95 Two eggs, bacon or sausage, toast, and our ever-popular hash browns 950
from lxml import etree
from lxml.cssselect import CSSSelector

# Parse the page with lxml's HTML parser instead of the default XML parser.
htmlparser = etree.HTMLParser()
doc = etree.parse("p5_addressbook.html", htmlparser)

# CSS attribute selector used to extract all the mobile numbers: <div>
# elements whose class attribute is "mobile".
div_selector = CSSSelector("div[class='mobile']")
matches = div_selector(doc)
for div in matches:
    print(div.text)
# Output:
# 95501551
import csv

from bs4 import BeautifulSoup

# Parse inside a with-block so the file handle is closed as soon as the
# document has been read (the original left the handle open).
with open("p5_addressbook.html") as html_file:
    soup = BeautifulSoup(html_file, "html.parser")

# All <div> tags carrying the CSS class "name".
links = soup.find_all('div', attrs={'class': 'name'})
for link in links:
    print(link)
    # contents[0] is the tag's first child node; in the output below it is
    # the name text.
    names = link.contents[0]
    print(names)
# Output:
# <div class="name">Kenny</div>
# Kenny
# <div class="name">Charles</div>
# Charles
import json

# Read the address book.  The with-block closes the file even if the JSON
# fails to parse, which the manual open()/close() pair did not guarantee.
with open("p5_addressbook.json", 'r') as f:
    l = json.load(f)
print(l)  # l is a list of dictionaries

# Serialise the same data back out; the file is closed on leaving the block.
with open("output.json", "w") as g:
    g.write(json.dumps(l))
# Output:
# [{'name': 'kenny', 'phone_type': 'office', 'phone_num': '65501696', 'email': 'kenny_lu@nyp.edu.sg'}, {'name': 'charles', 'phone_type': 'mobile', 'phone_num': '95501551', 'email': 'charles_lee@nyp.edu.sg'}]
import pandas as pd

# Load the JSON address book into a DataFrame and show its first rows
# (head() returns the first 5 rows by default; spelled out here).
df = pd.read_json("p5_addressbook.json")
print(df.head(5))
# Output (column alignment was lost in this transcript):
# name phone_type phone_num email
# 0 kenny office 65501696 kenny_lu@nyp.edu.sg
# 1 charles mobile 95501551 charles_lee@nyp.edu.sg