Parsing HTML

The XML Service can be used to parse HTML, but navigating through the resulting DOM tree can be a bit cumbersome.

In the examples below we will see how to make that easier with helper functions such as getElementById(), getElementsByClassName() and getElementsByTagName().

For example, with a few lines of code, you could grab the menu of a Wikipedia page to display it through an Apps Script web app.

/**
 * Web app entry point: fetches the Wikipedia "Document Object Model" page,
 * extracts its navigation box and serves that fragment as HTML.
 *
 * @return {HtmlOutput} The navigation box, serialized back to markup.
 * @throws {Error} If no element with the expected class is found.
 */
function doGet() {
  var response = UrlFetchApp.fetch('http://en.wikipedia.org/wiki/Document_Object_Model');

  // Keep the raw markup and the parsed root in separate variables instead of
  // re-declaring a single `html` variable with two different meanings.
  var doc = XmlService.parse(response.getContentText());
  var root = doc.getRootElement();

  var menu = getElementsByClassName(root, 'vertical-navbox nowraplinks')[0];
  if (!menu) {
    // Fail with an explicit message rather than handing `undefined` to the formatter.
    throw new Error("No element with class 'vertical-navbox nowraplinks' was found in the fetched page");
  }

  var output = XmlService.getRawFormat().format(menu);
  return HtmlService.createHtmlOutput(output);
}

    1. We fetch the HTML through UrlFetch

    2. We use XmlService to parse this HTML

    3. Then we can use a specific function to grab the element we want in the DOM tree (like getElementsByClassName)

    4. Finally, we convert this element back to HTML

Or we could get all the links / anchors available in this menu and display them:

/**
 * Web app entry point: fetches the Wikipedia "Document Object Model" page,
 * collects every <a> element inside its navigation box and serves them
 * one per line (separated by <br>).
 *
 * @return {HtmlOutput} The serialized links, each followed by '<br>'.
 * @throws {Error} If no element with the expected class is found.
 */
function doGet() {
  var response = UrlFetchApp.fetch('http://en.wikipedia.org/wiki/Document_Object_Model');

  // Keep the raw markup and the parsed root in separate variables instead of
  // re-declaring a single `html` variable with two different meanings.
  var doc = XmlService.parse(response.getContentText());
  var root = doc.getRootElement();

  var menu = getElementsByClassName(root, 'vertical-navbox nowraplinks')[0];
  if (!menu) {
    // Fail with an explicit message rather than calling helpers on `undefined`.
    throw new Error("No element with class 'vertical-navbox nowraplinks' was found in the fetched page");
  }

  var linksInMenu = getElementsByTagName(menu, 'a');

  // The formatter is loop-invariant, so build it once.
  var formatter = XmlService.getRawFormat();
  var output = '';

  // Indexed loop with a declared counter: the previous `for (i in ...)` leaked
  // `i` as a global and iterated array keys instead of positions.
  for (var i = 0; i < linksInMenu.length; i++) {
    output += formatter.format(linksInMenu[i]) + '<br>';
  }

  return HtmlService.createHtmlOutput(output);
}

getElementById()

/**
 * Finds the first descendant of `element` whose `id` attribute equals `idToFind`.
 *
 * Only the descendants are searched; `element` itself is not tested.
 *
 * @param {Element} element  Node whose descendants are searched.
 * @param {string} idToFind  Value of the `id` attribute to look for.
 * @return {Element|undefined} The first matching element in the order given by
 *     getDescendants(), or undefined when nothing matches.
 */
function getElementById(element, idToFind) {
  var descendants = element.getDescendants();

  // Indexed loop with a declared counter: the previous `for (i in ...)` leaked
  // `i` as a global (a ReferenceError in strict mode).
  for (var i = 0; i < descendants.length; i++) {
    var elt = descendants[i].asElement();

    // Nodes for which asElement() yields null cannot carry an id.
    if (elt == null) continue;

    var id = elt.getAttribute('id');

    // Loose equality is kept on purpose so existing callers that pass a
    // non-string id (e.g. a number) keep matching.
    if (id != null && id.getValue() == idToFind) return elt;
  }
}

getElementsByClassName()

/**
 * Collects `element` and all of its descendants that carry the class `classToFind`.
 *
 * A node matches when its whole `class` attribute equals `classToFind`
 * (which allows a multi-class query such as 'a b'), or when one of its
 * space-separated class names equals `classToFind`.
 *
 * @param {Element} element      Root of the search; it is tested as well.
 * @param {string} classToFind   Class name (or exact class attribute value) to match.
 * @return {Element[]} Matching elements in getDescendants() order, with
 *     `element` itself (when it matches) last.
 */
function getElementsByClassName(element, classToFind) {
  var data = [];

  // concat() builds a new list, so the array returned by getDescendants()
  // is no longer modified; the root still comes last, as before.
  var candidates = element.getDescendants().concat([element]);

  // Declared loop counters: the previous `for (i in ...)` / `for (j in ...)`
  // leaked both as globals (a ReferenceError in strict mode).
  for (var i = 0; i < candidates.length; i++) {
    var elt = candidates[i].asElement();
    if (elt == null) continue;

    var classAttr = elt.getAttribute('class');
    if (classAttr == null) continue;

    var classValue = classAttr.getValue();

    // Exact match on the full attribute value (covers multi-class queries).
    if (classValue == classToFind) {
      data.push(elt);
      continue;
    }

    // Otherwise match against each individual class name.
    var names = classValue.split(' ');
    for (var j = 0; j < names.length; j++) {
      if (names[j] == classToFind) {
        data.push(elt);
        break;
      }
    }
  }

  return data;
}

getElementsByTagName()

/**
 * Collects every descendant of `element` whose tag name equals `tagName`.
 *
 * Only the descendants are searched; `element` itself is not tested.
 *
 * @param {Element} element  Node whose descendants are searched.
 * @param {string} tagName   Element name to match, compared against getName().
 * @return {Element[]} Matching elements in getDescendants() order
 *     (empty when nothing matches).
 */
function getElementsByTagName(element, tagName) {
  var data = [];
  var descendants = element.getDescendants();

  // Indexed loop with a declared counter: the previous `for (i in ...)` leaked
  // `i` as a global (a ReferenceError in strict mode).
  for (var i = 0; i < descendants.length; i++) {
    var elt = descendants[i].asElement();

    // Skip nodes for which asElement() yields null.
    if (elt != null && elt.getName() == tagName) {
      data.push(elt);
    }
  }

  return data;
}