LEARN BY EXAMPLE‎ > ‎

Parsing HTML

The XML Service can be used to parse HTML, but navigating the resulting DOM tree can be a bit cumbersome.
The examples below show how to make that easier with helper functions such as getElementById(), getElementsByClassName() and getElementsByTagName().

For example, with a few lines of code, you could grab the menu of a Wikipedia page to display it through an Apps Script web app.

/**
 * Web app entry point: serves the navigation menu of the Wikipedia
 * "Document Object Model" article.
 *
 * The page is fetched with UrlFetchApp, parsed with XmlService, and the first
 * element returned by getElementsByClassName() for
 * 'vertical-navbox nowraplinks' is serialised back to markup.
 *
 * @return {HtmlOutput} the menu markup wrapped by HtmlService.
 * @throws {Error} if the fetched page contains no matching menu element.
 */
function doGet() {
  // Separate names for the raw text and the parsed tree: the two values have
  // different types and should not share one variable.
  var pageSource = UrlFetchApp.fetch('http://en.wikipedia.org/wiki/Document_Object_Model').getContentText();
  var root = XmlService.parse(pageSource).getRootElement();
  var menu = getElementsByClassName(root, 'vertical-navbox nowraplinks')[0];
  if (!menu) {
    // Fail with a readable message instead of handing `undefined` to the formatter.
    throw new Error('No element with class "vertical-navbox nowraplinks" was found in the fetched page');
  }
  var output = XmlService.getRawFormat().format(menu);
  return HtmlService.createHtmlOutput(output);
}

  1. We fetch the HTML with UrlFetchApp
  2. We use XmlService to parse this HTML
  3. Then we use a helper function (like getElementsByClassName) to grab the element we want from the DOM tree
  4. And we convert this element back to HTML
Or we could get all the links (anchors) available in this menu and display them:
/**
 * Web app entry point: lists every link (<a> element) found in the navigation
 * menu of the Wikipedia "Document Object Model" article, one per line.
 *
 * The page is fetched with UrlFetchApp, parsed with XmlService, the menu is
 * located with getElementsByClassName(), and its anchors are collected with
 * getElementsByTagName().
 *
 * @return {HtmlOutput} the serialised anchors, each followed by '<br>'.
 * @throws {Error} if the fetched page contains no matching menu element.
 */
function doGet() {
  // Separate names for the raw text and the parsed tree: the two values have
  // different types and should not share one variable.
  var pageSource = UrlFetchApp.fetch('http://en.wikipedia.org/wiki/Document_Object_Model').getContentText();
  var root = XmlService.parse(pageSource).getRootElement();
  var menu = getElementsByClassName(root, 'vertical-navbox nowraplinks')[0];
  if (!menu) {
    // Fail with a readable message instead of a TypeError deep inside the helper.
    throw new Error('No element with class "vertical-navbox nowraplinks" was found in the fetched page');
  }
  var linksInMenu = getElementsByTagName(menu, 'a');
  var formatter = XmlService.getRawFormat();
  var output = '';
  // Indexed loop with a declared counter: for...in would walk property keys
  // and, with an undeclared variable, leak a global.
  for (var i = 0; i < linksInMenu.length; i++) {
    output += formatter.format(linksInMenu[i]) + '<br>';
  }
  return HtmlService.createHtmlOutput(output);
}

getElementById()

/**
 * Returns the first descendant of `element` whose `id` attribute equals
 * `idToFind`.
 *
 * Only descendants are inspected, in the order returned by
 * `element.getDescendants()`; `element` itself is not checked.
 *
 * @param {Element} element - XmlService element whose descendants are searched.
 * @param {string} idToFind - value of the `id` attribute to look for.
 * @return {Element|undefined} the matching element, or undefined when no
 *     descendant carries that id.
 */
function getElementById(element, idToFind) {
  var descendants = element.getDescendants();
  // Declared index loop: the previous for...in used an undeclared `i`, which
  // leaks a global (and throws in strict mode).
  for (var i = 0; i < descendants.length; i++) {
    // asElement() yields null for content that is not an element (text, comments...).
    var elt = descendants[i].asElement();
    if (elt == null) continue;
    var id = elt.getAttribute('id');
    // Loose equality is kept on purpose so a non-string idToFind (e.g. a
    // number) still matches its string form, as it always has.
    if (id != null && id.getValue() == idToFind) return elt;
  }
}

getElementsByClassName()

/**
 * Collects `element` and all of its descendants that carry the requested
 * class name(s).
 *
 * An element matches when its `class` attribute equals `classToFind`, or when
 * its whitespace-separated class list contains every class named in
 * `classToFind` (in any order, extra classes allowed).
 *
 * Matches are returned in the order of `element.getDescendants()`, with
 * `element` itself examined last.
 *
 * @param {Element} element - XmlService element to search (included in the search).
 * @param {string} classToFind - one class name, or several separated by whitespace.
 * @return {Element[]} the matching elements; empty when nothing matches.
 */
function getElementsByClassName(element, classToFind) {
  // Splits a class list on any run of whitespace and drops empty tokens, so
  // tabs, newlines and repeated spaces in the attribute are handled.
  function tokenize(value) {
    var parts = String(value).split(/\s+/);
    var tokens = [];
    for (var k = 0; k < parts.length; k++) {
      if (parts[k] !== '') tokens.push(parts[k]);
    }
    return tokens;
  }

  var data = [];
  var wanted = classToFind == null ? [] : tokenize(classToFind);
  var candidates = element.getDescendants();
  candidates.push(element);
  // Declared index loops: the previous for...in loops used undeclared `i`/`j`,
  // which leak globals (and throw in strict mode).
  for (var i = 0; i < candidates.length; i++) {
    // asElement() yields null for content that is not an element (text, comments...).
    var elt = candidates[i].asElement();
    if (elt == null) continue;
    var classAttr = elt.getAttribute('class');
    if (classAttr == null) continue;

    var classValue = classAttr.getValue();
    // Whole-attribute match first; loose equality kept so existing callers
    // see the same results as before.
    if (classValue == classToFind) {
      data.push(elt);
      continue;
    }
    // Nothing to look for token-wise (empty / blank / missing classToFind).
    if (wanted.length === 0) continue;

    var present = tokenize(classValue);
    var hasAll = true;
    for (var j = 0; j < wanted.length; j++) {
      if (present.indexOf(wanted[j]) === -1) {
        hasAll = false;
        break;
      }
    }
    if (hasAll) data.push(elt);
  }
  return data;
}

getElementsByTagName()

/**
 * Collects every descendant of `element` whose tag name equals `tagName`.
 *
 * Only descendants are inspected, in the order returned by
 * `element.getDescendants()`; `element` itself is not included.
 *
 * @param {Element} element - XmlService element whose descendants are searched.
 * @param {string} tagName - element name to match, compared as-is against getName().
 * @return {Element[]} the matching elements; empty when nothing matches.
 */
function getElementsByTagName(element, tagName) {
  var data = [];
  var descendants = element.getDescendants();
  // Declared index loop: the previous for...in used an undeclared `i`, which
  // leaks a global (and throws in strict mode).
  for (var i = 0; i < descendants.length; i++) {
    // asElement() yields null for content that is not an element (text, comments...).
    var elt = descendants[i].asElement();
    // Loose equality kept so results are unchanged for existing callers.
    if (elt != null && elt.getName() == tagName) data.push(elt);
  }
  return data;
}