from lxml import etree
# Load the address book. The XML file must be well-formed, otherwise the
# parse fails.
doc = etree.parse("p5_addressbook.xml")
# Absolute XPath: the <contact> elements directly under the <addressbook> root.
results = doc.xpath("/addressbook/contact")
print(results)
[<Element contact at 0x1ce580b9b88>, <Element contact at 0x1ce580b9bc8>]
# Print the text of every child element of each <contact>, one list per contact.
# NOTE: doc.xpath("//contact") would select <contact> elements at any depth
# instead of only those directly under the <addressbook> root.
results = doc.xpath("/addressbook/contact")
for contact in results:
    # Iterating an element yields its direct children; this replaces the
    # deprecated Element.getchildren().
    print([child.text for child in contact])
['Kenny', '65501696', 'kenny_lu@nyp.edu.sg']
['Charles', '95501551', 'charles_lee@nyp.edu.sg']
# Print name, phone and email of every contact on one formatted line.
results = doc.xpath("/addressbook/contact")
for contact in results:
    # Relative XPath, evaluated from the current <contact>; each query
    # returns a list of matching text nodes.
    name = contact.xpath("name/text()")
    phone = contact.xpath("phone/text()")
    email = contact.xpath("email/text()")
    # Only the first match of each field is reported.
    fields = (name[0], phone[0], email[0])
    print("name = %s, phone = %s, email = %s" % fields)
name = Kenny, phone = 65501696, email = kenny_lu@nyp.edu.sg
name = Charles, phone = 95501551, email = charles_lee@nyp.edu.sg
# Attribute predicate: keep only <phone> children of a <contact> whose
# "type" attribute equals "mobile".
results = doc.xpath("//contact/phone[@type='mobile']")
for mobile_phone in results:
    print(mobile_phone.text)
95501551
from lxml import etree
def menu_xml_to_tsv():
    """Convert ``p5_menu.xml`` into the tab-separated file ``p5_menu.tsv``.

    Every ``<food>`` element directly under the ``<breakfast_menu>`` root
    becomes one output line; the text of its child elements forms the
    tab-separated columns, in document order. The list of matched elements,
    each row (as a list) and each joined line are also printed.
    """
    menu_xml = etree.parse("p5_menu.xml")
    results = menu_xml.xpath("/breakfast_menu/food")
    # For the sample menu this prints a list of five <Element food at 0x...>
    # objects.
    print(results)

    separator = "\t"
    # The with-statement closes the file on exit, so no explicit close() is
    # needed. UTF-8 keeps non-ASCII menu text writable on every platform.
    with open("p5_menu.tsv", "w", encoding="utf-8") as menu_tsv:
        for food in results:
            # Iterating an element yields its direct children (replaces the
            # deprecated getchildren()). A child without text has .text set
            # to None, which str.join() rejects, so substitute "".
            row = [child.text or "" for child in food]
            print(row)
            line = separator.join(row)
            print(line)
            menu_tsv.write(line + "\n")


menu_xml_to_tsv()
['Belgian Waffles', '5.95', 'Two of our famous Belgian Waffles with plenty of real maple syrup', '650']
Belgian Waffles 5.95 Two of our famous Belgian Waffles with plenty of real maple syrup 650
['Strawberry Belgian Waffles', '7.95', 'Light Belgian waffles covered with strawberries and whipped cream', '900']
Strawberry Belgian Waffles 7.95 Light Belgian waffles covered with strawberries and whipped cream 900
['Berry-Berry Belgian Waffles', '8.95', 'Light Belgian waffles covered with an assortment of fresh berries and whipped cream', '900']
Berry-Berry Belgian Waffles 8.95 Light Belgian waffles covered with an assortment of fresh berries and whipped cream 900
['French Toast', '4.50', 'Thick slices made from our homemade sourdough bread', '600']
French Toast 4.50 Thick slices made from our homemade sourdough bread 600
['Homestyle Breakfast', '6.95', 'Two eggs, bacon or sausage, toast, and our ever-popular hash browns', '950']
Homestyle Breakfast 6.95 Two eggs, bacon or sausage, toast, and our ever-popular hash browns 950
from lxml import etree
from lxml.cssselect import CSSSelector
# Parse the HTML version of the address book with lxml's HTML parser.
htmlparser = etree.HTMLParser()
doc = etree.parse("p5_addressbook.html", parser=htmlparser)

# Selects <div> elements whose class attribute is exactly "mobile",
# i.e. the mobile numbers.
div_selector = CSSSelector("div[class='mobile']")
for mobile_div in div_selector(doc):
    print(mobile_div.text)
95501551
from bs4 import BeautifulSoup
import csv
# Parse inside a context manager so the HTML file handle is closed as soon as
# BeautifulSoup has consumed it (the original left the handle open).
with open("p5_addressbook.html") as html_file:
    soup = BeautifulSoup(html_file, "html.parser")

# Every <div class="name"> holds one contact name.
links = soup.find_all('div', attrs={'class': 'name'})
for link in links:
    print(link)
    # The first child node of the <div> is its text content.
    names = link.contents[0]
    print(names)
<div class="name">Kenny</div>
Kenny
<div class="name">Charles</div>
Charles
import json
# Read the address book. The with-statement guarantees the handle is closed
# even if JSON decoding raises.
with open("p5_addressbook.json", "r", encoding="utf-8") as f:
    l = json.load(f)
print(l)  # l is a list of dictionaries

# Serialise the same data back out to a second file.
with open("output.json", "w", encoding="utf-8") as g:
    json.dump(l, g)
[{'name': 'kenny', 'phone_type': 'office', 'phone_num': '65501696', 'email': 'kenny_lu@nyp.edu.sg'}, {'name': 'charles', 'phone_type': 'mobile', 'phone_num': '95501551', 'email': 'charles_lee@nyp.edu.sg'}]
import pandas as pd
# Load the JSON address book into a DataFrame and preview its first rows
# (5 is the default row count of head()).
df = pd.read_json("p5_addressbook.json")
print(df.head(n=5))
name phone_type phone_num email
0 kenny office 65501696 kenny_lu@nyp.edu.sg
1 charles mobile 95501551 charles_lee@nyp.edu.sg