Application -- extract name and meaning from HTML tag
Post date: Jul 17, 2014 8:53:14 AM
Here is the code I used:
## ========================================================
## PART 1
## ========================================================
## This program retrieves the URL of the page for each Thai character
## ========================================================

## Relative output paths used later in the script resolve against this
## directory.
setwd("~/research/thai_name/")

## Start with the main page. tryCatch(finally = ) closes the connection even
## when the download itself fails, so no connection is leaked.
con <- url("http://thaigoodname.com")
htmlCode <- tryCatch(readLines(con), finally = close(con))
str(htmlCode)
head(htmlCode, 20)

## Get all the urls I want.
## The character index is on a line of the form <h2><center>...</h2>. The
## pattern is anchored at both ends, so a match is always the whole line and
## grep(value = TRUE) returns exactly the matching lines.
index_lines <- grep("^(<h2><center>)(.*)(</h2>)$", htmlCode, value = TRUE)
if (length(index_lines) == 0) {
  stop("no '<h2><center>...</h2>' index line found on the main page",
       call. = FALSE)
}

## Only the first matching line is used; it is cut into one piece per link.
urls <- strsplit(index_lines[1], split = "<a title=")[[1]]

## Pull the '/<digits>/<name>.htm' path out of every piece. regmatches() with
## regexpr() returns only the pieces that matched, so no NA filtering is
## needed. The dot is escaped so that only a literal ".htm" ends the match.
link_pattern <- "/[0-9]+/(.*)(\\.htm)"

## This is the url (path part) to each character
eachChar <- regmatches(urls, regexpr(link_pattern, urls))
## ========================================================
## PART 2
## ========================================================
## Get name and meaning for each letter
## ========================================================

## Extract the names and meanings listed on one character page.
##
## T: character vector holding the page's HTML, one element per line (for
##    example the result of readLines()). The argument name is kept so that
##    existing calls keep working, even though it shadows the `T` shorthand
##    inside the function.
##
## Returns a data.frame with character columns `name` and `meaning`, one row
## per line of the form <h2><a href="http://thaigoodname.com...</h2>, and
## zero rows when the page contains no such line.
getNameMeaning <- function(T) {
  ## The pattern is anchored at both ends, so a match is the whole line.
  link_lines <- grep(
    "^(<h2><a href=\"http://thaigoodname\\.com)(.*)(</h2>)$",
    T,
    value = TRUE
  )

  ## Cut each line at the site prefix and at ".asp"; the second piece is the
  ## "<name>/<meaning>" part of the link. Dots are escaped so that only the
  ## literal text splits the line.
  pieces <- strsplit(
    link_lines,
    split = "(http://thaigoodname\\.com/)|(\\.asp)"
  )
  name_meaning <- vapply(pieces, function(x) x[2], character(1))

  ## A missing part comes out as NA rather than an error.
  parts <- strsplit(name_meaning, split = "/")
  Thai_name <- vapply(parts, function(x) x[1], character(1))
  meaning <- vapply(parts, function(x) x[2], character(1))

  data.frame(
    "name" = Thai_name,
    "meaning" = meaning,
    stringsAsFactors = FALSE
  )
}

## Download every character page. The per-page tables are collected in a
## preallocated list and bound together once after the loop instead of
## growing DF with rbind() on every iteration.
## (For a quick trial run, loop over head(seq_along(eachChar), 3) instead.)
pages <- vector("list", length(eachChar))
for (i in seq_along(eachChar)) {
  cat(sprintf('processing data %i from %i\n', i, length(eachChar)))
  myURL <- paste0("http://thaigoodname.com", eachChar[i])
  con <- url(myURL)
  ## Close the connection even when the download fails.
  htmlCode <- tryCatch(readLines(con), finally = close(con))
  pages[[i]] <- getNameMeaning(htmlCode)
}

DF <- do.call(rbind, pages)
if (is.null(DF)) {
  ## No pages at all: keep DF a data.frame with the usual two columns.
  DF <- getNameMeaning(character(0))
}

## Both columns are already character (stringsAsFactors = FALSE above), so no
## as.character() conversion is needed before saving.
save(DF, file = 'thai_name_July162014.RData')
write.table(x = DF, file = 'thai_name_July162014.csv', sep = ',', na = '',
            row.names = FALSE)
And below is some test code:
# Spot checks on the scraped table DF.

# Make sure the first two columns are plain character vectors.
DF[1:2] <- lapply(DF[1:2], as.character)

# Which of the first five names contain the given character?
grepl("ก", DF[1:5, 1])

# Try progressively longer patterns against the fifth name; grep() returns
# 1 on a match and integer(0) otherwise.
grep("กช", DF[5, 1])
grep("กชน", DF[5, 1])
grep("กชนุ", DF[5, 1])
grep("กชนุท", DF[5, 1])
grep("กชนุช", DF[5, 1])

# Patterns made of a single vowel mark.
grepl("ุ", DF[1:5, 1])
grepl("ู", DF[5, 1])