Application -- extracting names and meanings from HTML tags

Post date: Jul 17, 2014 8:53:14 AM

Here is the code I used:

# NOTE(review): hard-coded working directory; files written later in the
# script are saved relative to it.
setwd('~/research/thai_name/')

## ========================================================
##                     PART 1
## ========================================================
## This program retrieves the URL of the page for each Thai character
## ========================================================

## Start with the main page. tryCatch(finally = ) makes sure the connection
## is closed even when the download fails.
con <- url("http://thaigoodname.com")
htmlCode <- tryCatch(readLines(con), finally = close(con))
str(htmlCode)
head(htmlCode, 20)

## Get all the urls I want.
## Keep only the lines that consist of a single <h2><center> ... </h2> element.
## vapply() always returns a character vector (NA where a line did not match),
## even when the page is empty.
index_match <- regmatches(
  x = htmlCode,
  m = regexec(pattern = '^(<h2><center>)(.*)(</h2>)$', text = htmlCode)
)
index_line <- vapply(index_match, function(x) x[1], character(1))
content <- index_line[!is.na(index_line)]
if (length(content) == 0L) {
  stop("no '<h2><center>...</h2>' line found on the main page", call. = FALSE)
}

## Only the first matching line is used; it is cut into one piece per anchor.
urls <- strsplit(x = content[1], split = '<a title=')[[1]]

## Pull the "/<digits>/<something>.htm" part out of each piece. The dot is
## written as [.] so that it only matches a literal ".".
link_match <- regmatches(
  x = urls,
  m = regexec(pattern = '/[0-9]+/(.*)([.]htm)', text = urls)
)

## This is the url to each character (pieces without a link are dropped).
eachChar <- vapply(link_match, function(x) x[1], character(1))
eachChar <- eachChar[!is.na(eachChar)]
## ========================================================
##                     PART 2
## ========================================================
## Get name and meaning for each letter
## ========================================================

## Extract the names and meanings listed on one page.
##
## T: character vector of HTML lines, e.g. the result of readLines(). The
##    argument keeps its original name for backward compatibility; it masks
##    the `T` shorthand inside this function, so TRUE/FALSE are spelled out.
##
## Returns a data.frame with character columns `name` and `meaning`, one row
## per line of the form <h2><a href="http://thaigoodname.com ... </h2>, and
## zero rows when no line matches.
getNameMeaning <- function(T) {
  hits <- regmatches(
    x = T,
    m = regexec(
      pattern = '^(<h2><a href=\"http://thaigoodname.com)(.*)(</h2>)$',
      text = T
    )
  )
  ## vapply() (unlike sapply()) still returns a character vector when there is
  ## nothing to iterate over, so an empty page no longer breaks strsplit().
  matched <- vapply(hits, function(x) x[1], character(1))
  matched <- matched[!is.na(matched)]

  ## The link is treated as http://thaigoodname.com/<name>/<meaning>.asp:
  ## the second piece after splitting on the host prefix and ".asp" is
  ## "<name>/<meaning>". Dots are escaped so they only match a literal ".".
  pieces <- strsplit(
    x = matched,
    split = "(http://thaigoodname\\.com/)|(\\.asp)"
  )
  path <- vapply(pieces, function(x) x[2], character(1))
  parts <- strsplit(x = path, split = '/')

  data.frame(
    name = vapply(parts, function(x) x[1], character(1)),
    meaning = vapply(parts, function(x) x[2], character(1)),
    stringsAsFactors = FALSE
  )
}

## Download one page and return its lines; the connection is closed even if
## reading fails.
readPageLines <- function(page_url) {
  con <- url(page_url)
  on.exit(close(con), add = TRUE)
  readLines(con)
}

## Collect one data.frame per character page, then bind them once at the end
## instead of growing DF with rbind() on every iteration.
pages <- vector("list", length(eachChar))
for (i in seq_along(eachChar)) {
  cat(sprintf('processing data %i from %i\n', i, length(eachChar)))
  htmlCode <- readPageLines(paste0("http://thaigoodname.com", eachChar[i]))
  pages[[i]] <- getNameMeaning(htmlCode)
}

DF <- do.call(rbind, pages)
if (is.null(DF)) {
  ## No pages were processed: keep DF a well-formed, empty table.
  DF <- data.frame(
    name = character(0),
    meaning = character(0),
    stringsAsFactors = FALSE
  )
}

## Both columns are already character (stringsAsFactors = FALSE above), so no
## as.character() conversion is needed before saving.
save(DF, file = 'thai_name_July162014.RData')
write.table(
  x = DF,
  file = 'thai_name_July162014.csv',
  sep = ',',
  na = '',
  row.names = FALSE
)

Below is some test code:

# Make sure both columns of DF are character before matching patterns.
DF[[1]] <- as.character(DF[[1]])
DF[[2]] <- as.character(DF[[2]])

# Is the single consonant found in each of the first five names?
grepl("ก", DF[[1]][1:5])

# Grow the pattern one character at a time and match it against the fifth name.
grep("กช", DF[[1]][5])
grep("กชน", DF[[1]][5])
grep("กชนุ", DF[[1]][5])
grep("กชนุท", DF[[1]][5])
grep("กชนุช", DF[[1]][5])

# Match the vowel signs on their own.
grepl("ุ", DF[[1]][1:5])
grepl("ู", DF[[1]][5])