// Saves word meanings from www.urduenglishdictionary.org
import java.net.*;
import java.io.*;
/**
 * Command-line entry point: downloads a range of result pages for one letter
 * from www.urduenglishdictionary.org and appends the extracted rows to a file.
 *
 * Usage:   java UrduEnglishDictionary letter firstPage lastPage outputFile
 * Example: java UrduEnglishDictionary B 1 5 b.txt
 */
public class UrduEnglishDictionary
{
    /**
     * Number of the page currently being processed. Kept as a static field
     * because UrduEngDictThread reads it for its progress message.
     */
    static int i=0;

    /**
     * Processes pages firstPage..lastPage (inclusive) one after another.
     *
     * @param args args[0] = letter used in the URL path, args[1] = first page,
     *             args[2] = last page, args[3] = output file name
     * @throws Exception declared for backward compatibility with existing callers
     */
    public static void main(String[] args) throws Exception
    {
        // Validate up front instead of failing with ArrayIndexOutOfBoundsException.
        if (args == null || args.length < 4) {
            printUsage();
            return;
        }

        // Parse the page bounds once rather than on every loop iteration.
        int firstPage;
        int lastPage;
        try {
            firstPage = Integer.parseInt(args[1]);
            lastPage = Integer.parseInt(args[2]);
        } catch (NumberFormatException e) {
            System.err.println("Page numbers must be integers: " + e.getMessage());
            printUsage();
            return;
        }

        // Start time in seconds; each page prints its own completion time.
        System.out.println(System.currentTimeMillis()/1000);
        for (i = firstPage; i <= lastPage; i++) {
            String pageUrl = "http://www.urduenglishdictionary.org/English-To-Urdu-Translation/"
                    + args[0] + "/Page-" + i + ".htm";
            // Called directly (not via start()), so pages are fetched sequentially.
            new UrduEngDictThread(pageUrl).savePageExtractToFile(args[3]);
        }
    }//main

    /** Prints the expected command line to standard error. */
    private static void printUsage()
    {
        System.err.println("Usage: java UrduEnglishDictionary <letter> <firstPage> <lastPage> <outputFile>");
        System.err.println("Example: java UrduEnglishDictionary B 1 5 b.txt");
    }
}//UrduEnglishDictionary
//----------------------------------------------------------------------
//void run(); by default saves the page extract in out.txt
//void savePageExtractToFile(outFile); appends the page extract to outFile
//----------------------------------------------------------------------
/**
 * Downloads one dictionary result page and appends the rows extracted from it
 * to a text file.
 *
 * Each extracted row has the form {@code serial|englishWord|wordType|urduWord}
 * followed by {@code "\r\n"}.
 */
class UrduEngDictThread extends Thread
{
    /**
     * Extraction table, one entry per field in page order:
     * { marker that opens the field, marker that closes it, text written after it }.
     */
    private static final String[][] FIELDS = {
        // Serial number
        { "<td class='data-cell' align=left valign=top>", "</td>", "|" },
        // English word
        { "<td class='data-cell' align=left valign=top nowrap>", "<sup>", "|" },
        // English word type
        { "<span class='feature'>", "<BR>", "|" },
        // Urdu word
        { "<td class='urdu-cell' align=right valign=top>", "</td>", "\r\n" },
    };

    /** Address of the page this instance processes. */
    String URLString;

    //-----------------------------------------------------
    /**
     * @param URLString address of the page to process; also used as the thread name
     */
    public UrduEngDictThread(String URLString)
    {
        super(URLString);
        this.URLString=URLString;
    }

    //-----------------------------------------------------
    /** Saves the page extract to the default file, out.txt. */
    @Override
    public void run()
    {
        savePageExtractToFile("out.txt");
    }

    //-----------------------------------------------------
    /**
     * Fetches the page, extracts its rows and appends them to {@code outFile}
     * encoded as UTF-8. Failures are reported on standard error and never
     * thrown, so one bad page does not abort a multi-page run.
     *
     * @param outFile file to append to; created if it does not exist
     */
    public void savePageExtractToFile(String outFile)
    {
        // Fetch before opening the file so the handle is not held during network I/O.
        String extract = getMeaningsFromURL(URLString);

        // try-with-resources closes the writer even when write() fails.
        try (Writer out = new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(outFile, true), "UTF8"))) {
            out.write(extract);
        } catch (Exception e) {
            // Best-effort boundary: report and carry on with the next page.
            System.err.println("Could not write extract of " + URLString + " to " + outFile + ": " + e);
            return;
        }
        System.out.println("Page:"+UrduEnglishDictionary.i+",Time:" +System.currentTimeMillis()/1000+" URL:"+URLString);
    }

    //-----------------------------------------------------
    /**
     * Downloads the page at {@code HTMLSourceURL} as UTF-8 text and returns the
     * rows extracted from it by {@link #getMeanings(String)}.
     *
     * If the download fails, the error is reported on standard error and
     * whatever text was read before the failure is still parsed.
     *
     * @param HTMLSourceURL address of the page to download
     * @return the extracted rows, or "" when nothing could be extracted
     */
    public static String getMeaningsFromURL(String HTMLSourceURL)
    {
        StringBuilder page = new StringBuilder();
        try (BufferedReader in = new BufferedReader(
                new InputStreamReader(new URL(HTMLSourceURL).openStream(), "UTF-8"))) {
            String line;
            while ((line = in.readLine()) != null) {
                // Lines are joined without a separator; the markers are matched
                // on the resulting single-line text.
                page.append(line);
            }
        } catch (Exception e) {
            System.err.println("Could not read " + HTMLSourceURL + ": " + e);
        }
        return getMeanings(page.toString());
    }//getMeaningsFromURL

    //-----------------------------------------------------
    /**
     * Extracts every complete dictionary row from the given page text.
     *
     * For each row the four fields listed in {@code FIELDS} are located in
     * order, each search starting where the previous field ended. Scanning
     * stops at the first row for which any marker is missing; that incomplete
     * row is dropped and the rows collected so far are returned.
     *
     * @param HTMLSource page text; may be null
     * @return one {@code serial|word|type|urdu\r\n} line per complete row,
     *         or "" when there is none
     */
    public static String getMeanings(String HTMLSource)
    {
        StringBuilder rows = new StringBuilder();
        if (HTMLSource == null) {
            return rows.toString();
        }

        int pos = 0; // index where the previous field's closing marker begins
        while (true) {
            // Build the row separately so a truncated row is never emitted.
            StringBuilder row = new StringBuilder();
            for (String[] field : FIELDS) {
                String open = field[0];
                String close = field[1];

                int start = HTMLSource.indexOf(open, pos);
                if (start < 0) {
                    return rows.toString();
                }
                int contentStart = start + open.length();
                int end = HTMLSource.indexOf(close, contentStart);
                if (end < 0) {
                    return rows.toString();
                }

                row.append(HTMLSource, contentStart, end).append(field[2]);
                pos = end;
            }
            rows.append(row);
        }
    }//getMeanings
    //-----------------------------------------------------
}//class