Parsing HTML
The XML Service can be used to parse HTML, but navigating the resulting DOM tree can be a bit cumbersome.
The examples below show how to make that easier with helper functions such as getElementById(), getElementsByClassName() and getElementsByTagName().
For example, with a few lines of code, you could grab the menu of a Wikipedia page to display it through an Apps Script web app.
/**
 * Web app entry point: fetches the Wikipedia "Document Object Model" article,
 * parses it with XmlService and serves the first element whose class is
 * 'vertical-navbox nowraplinks' back as HTML.
 *
 * @return {HtmlOutput} Output wrapping the serialized menu element.
 * @throws {Error} If no element with the expected class is found in the page.
 */
function doGet() {
  // Keep the raw page source and the parsed root in separate variables
  // instead of reusing one name for two different kinds of value.
  var pageSource = UrlFetchApp.fetch('http://en.wikipedia.org/wiki/Document_Object_Model').getContentText();
  var doc = XmlService.parse(pageSource);
  var root = doc.getRootElement();

  var menu = getElementsByClassName(root, 'vertical-navbox nowraplinks')[0];
  if (menu == null) {
    // Fail with a clear message rather than handing `undefined` to the formatter.
    throw new Error('No element with class "vertical-navbox nowraplinks" was found in the page.');
  }

  // Serialize the selected element back to markup.
  var output = XmlService.getRawFormat().format(menu);
  return HtmlService.createHtmlOutput(output);
}
First, we fetch the HTML with UrlFetchApp.
Next, we use XmlService to parse this HTML.
Then we use a helper function (such as getElementsByClassName) to grab the element we want from the DOM tree.
Finally, we convert this element back to HTML.
Alternatively, we could get all the links (anchors) available in this menu and display them:
/**
 * Web app entry point: fetches the Wikipedia "Document Object Model" article,
 * locates the first element whose class is 'vertical-navbox nowraplinks' and
 * serves every <a> element found inside it, each followed by a <br>.
 *
 * @return {HtmlOutput} Output containing the serialized links.
 * @throws {Error} If no element with the expected class is found in the page.
 */
function doGet() {
  // Keep the raw page source and the parsed root in separate variables
  // instead of reusing one name for two different kinds of value.
  var pageSource = UrlFetchApp.fetch('http://en.wikipedia.org/wiki/Document_Object_Model').getContentText();
  var doc = XmlService.parse(pageSource);
  var root = doc.getRootElement();

  var menu = getElementsByClassName(root, 'vertical-navbox nowraplinks')[0];
  if (menu == null) {
    // Fail with a clear message rather than searching inside `undefined`.
    throw new Error('No element with class "vertical-navbox nowraplinks" was found in the page.');
  }

  var linksInMenu = getElementsByTagName(menu, 'a');
  // Create the formatter once; it does not depend on the loop variable.
  var formatter = XmlService.getRawFormat();
  var output = '';
  // Indexed loop with a declared counter: no implicit global, no for...in on an array.
  for (var i = 0; i < linksInMenu.length; i++) {
    output += formatter.format(linksInMenu[i]) + '<br>';
  }
  return HtmlService.createHtmlOutput(output);
}
getElementById()
/**
 * Returns the first descendant of `element` whose `id` attribute equals
 * `idToFind`. Only descendants are inspected; `element` itself is not tested.
 *
 * @param {Element} element - Node whose descendants are searched.
 * @param {string} idToFind - Value the `id` attribute must have.
 * @return {Element|undefined} The first matching element, or undefined when
 *     none matches.
 */
function getElementById(element, idToFind) {
  // Compare as strings so a numeric argument still matches a textual id.
  var wantedId = String(idToFind);
  var descendants = element.getDescendants();
  // Indexed loop with a declared counter: no implicit global, no for...in on an array.
  for (var i = 0; i < descendants.length; i++) {
    var elt = descendants[i].asElement();
    // Nodes for which asElement() yields null are skipped.
    if (elt == null) continue;
    var id = elt.getAttribute('id');
    if (id != null && id.getValue() === wantedId) return elt;
  }
  return undefined;
}
getElementsByClassName()
/**
 * Collects every element under `element` (and `element` itself) whose `class`
 * attribute either equals `classToFind` exactly or contains it as one of its
 * whitespace-separated tokens.
 *
 * Descendants are reported in the order getDescendants() returns them; the
 * starting element, when it matches, is reported last.
 *
 * @param {Element} element - Node at which the search starts.
 * @param {string} classToFind - A single class name, or a full `class`
 *     attribute value to match exactly (e.g. 'vertical-navbox nowraplinks').
 * @return {Element[]} The matching elements; empty when nothing matches.
 */
function getElementsByClassName(element, classToFind) {
  var wanted = String(classToFind);
  var matches = [];

  // Appends `node` to `matches` when it is an element carrying the wanted class.
  function collectIfMatching(node) {
    var elt = node.asElement();
    // Nodes for which asElement() yields null are skipped.
    if (elt == null) return;
    var classAttr = elt.getAttribute('class');
    if (classAttr == null) return;
    var value = classAttr.getValue();
    // The exact comparison keeps multi-class queries working; the token test
    // splits on any whitespace run so tabs and newlines also separate classes.
    if (value === wanted || value.split(/\s+/).indexOf(wanted) !== -1) {
      matches.push(elt);
    }
  }

  var descendants = element.getDescendants();
  // Indexed loop with a declared counter: no implicit global, no for...in on an array.
  for (var i = 0; i < descendants.length; i++) {
    collectIfMatching(descendants[i]);
  }
  // Test the starting element after its descendants, without mutating the
  // array handed back by getDescendants().
  collectIfMatching(element);

  return matches;
}
getElementsByTagName()
/**
 * Collects every descendant element of `element` whose name equals `tagName`.
 * Only descendants are inspected; `element` itself is not tested. The
 * comparison is case-sensitive.
 *
 * @param {Element} element - Node whose descendants are searched.
 * @param {string} tagName - Element name to look for, e.g. 'a'.
 * @return {Element[]} Matching elements in the order getDescendants() returns
 *     them; empty when nothing matches.
 */
function getElementsByTagName(element, tagName) {
  var matches = [];
  var descendants = element.getDescendants();
  // Indexed loop with a declared counter: no implicit global, no for...in on an array.
  for (var i = 0; i < descendants.length; i++) {
    var elt = descendants[i].asElement();
    // Nodes for which asElement() yields null are skipped.
    if (elt != null && elt.getName() === tagName) {
      matches.push(elt);
    }
  }
  return matches;
}