Finding and storing the multiplicity of words from a file
One needs:
A MySQL database schema "NLP" with a table "words". The table has the following structure:
string: varchar(45), nr: int(11), frequency: decimal(10,7)
- the column "string" contains the words discovered in the input file
- the column "nr" contains the number of times the word appears in the input file
- the column "frequency" contains the weight of the word in the input file, computed as the natural logarithm of "nr" divided by the total number of words in the file
To compile, add the "mysql-connector-java..." library under "Project properties/Libraries/Compile" (for NetBeans).
The input file contains a large text in a given language.
/***************************************************************************************/
//author: Dragos Sburlan
//description: the class handles a database connection
package nlp;
import java.sql.DriverManager;
import java.sql.Connection;
import java.sql.ResultSetMetaData;
import java.sql.Statement;
import java.sql.ResultSet;
import java.util.ArrayList;
/**
 * Small wrapper around a single JDBC {@link Connection}.
 *
 * <p>The connection is opened in the constructor and released with {@link #stop()}.
 * Failures are reported on the console instead of being thrown: when the connection
 * could not be opened, {@link #runSQL(String)} returns {@code null} and
 * {@link #stop()} does nothing.
 */
public final class DBAccessController {
    /** The open connection, or {@code null} when the constructor failed to connect. */
    private Connection connection = null;

    /**
     * Loads the MySQL JDBC driver and opens a connection.
     *
     * @param url      JDBC URL of the database
     * @param userId   database user name
     * @param password password of {@code userId}
     */
    public DBAccessController(String url, String userId, String password) {
        try {
            // Loading the driver class is enough for it to register itself with
            // DriverManager; instantiating it via the deprecated newInstance() is not needed.
            Class.forName("com.mysql.jdbc.Driver");
            connection = DriverManager.getConnection(url, userId, password);
        } catch (ClassNotFoundException | java.sql.SQLException e) {
            // Report the failure instead of hiding it; the connection stays null.
            System.err.println("Could not open database connection: " + e);
        }
    }

    /**
     * Executes one SQL statement with auto-commit enabled.
     *
     * @param queryString the SQL text to execute
     * @return for statements that produce a result set, a list with one
     *         {@code ArrayList<Object>} per row holding the column values in column
     *         order; {@code null} when the statement produces no result set, when no
     *         connection is open, or when an SQL error occurs (the error is printed)
     */
    public final synchronized ArrayList runSQL(String queryString) {
        if (connection == null) {
            System.err.println("runSQL: no open database connection");
            return null;
        }
        // try-with-resources releases the statement (and its result set) on every path
        try (Statement statement = connection.createStatement()) {
            connection.setAutoCommit(true);
            if (!statement.execute(queryString)) {
                return null;
            }
            try (ResultSet res = statement.getResultSet()) {
                ResultSetMetaData rsmd = res.getMetaData();
                int numberOfColumns = rsmd.getColumnCount();
                ArrayList<ArrayList<Object>> rows = new ArrayList<>();
                while (res.next()) {
                    ArrayList<Object> row = new ArrayList<>(numberOfColumns);
                    // JDBC column indexes are 1-based
                    for (int i = 1; i <= numberOfColumns; i++) {
                        row.add(res.getObject(i));
                    }
                    rows.add(row);
                }
                return rows;
            }
        } catch (java.sql.SQLException sqle) {
            System.out.println(sqle.toString());
            return null;
        }
    }

    /**
     * Closes the connection if one was opened. Errors raised while closing are
     * reported on the console.
     */
    public final void stop() {
        if (connection == null) {
            return;
        }
        try {
            connection.close();
        } catch (java.sql.SQLException e) {
            System.err.println("Could not close database connection: " + e);
        }
    }
}
/***************************************************************************************/
//author: Dragos Sburlan
//description: the utility class WordsCounter is responsible for counting the occurrences of each word in a file
package nlp;
import java.io.*;
import java.util.*;
/**
 * Counts how many times each word occurs in a UTF-16 encoded text file.
 *
 * <p>The file is read once, in the constructor. A word is a maximal run of
 * {@code \w} characters; everything else is treated as a separator.
 */
public class WordsCounter {
    // Separator pattern, compiled once instead of once per line.
    // NOTE(review): \w is ASCII-only by default, so letters outside [a-zA-Z0-9_]
    // act as separators — confirm this is intended for non-English input.
    private static final java.util.regex.Pattern SEPARATORS =
            java.util.regex.Pattern.compile("[^\\w]+");

    /** Total number of word occurrences read from the file. */
    private int words_total_number = 0;
    /** Maps each distinct word to its number of occurrences. */
    private HashMap<String,Integer> words_ht = new HashMap<>();

    /**
     * Reads {@code file_name} and counts its words. An I/O error (including a
     * missing file) is printed and leaves the counts gathered so far.
     *
     * @param file_name path of the UTF-16 encoded input file
     */
    WordsCounter(String file_name) {
        // try-with-resources closes the reader even when reading fails
        try (BufferedReader br = new BufferedReader(
                new InputStreamReader(new FileInputStream(file_name), "UTF-16"))) {
            String line;
            while ((line = br.readLine()) != null) {
                for (String word : SEPARATORS.split(line)) {
                    // split() yields "" for a blank line and for a line that starts
                    // with a separator; that is not a word and must not be counted
                    if (word.isEmpty()) {
                        continue;
                    }
                    Integer count = words_ht.get(word);
                    words_ht.put(word, count == null ? 1 : count + 1);
                    this.words_total_number++;
                }
            }
        } catch (IOException e) {
            System.out.println(e.toString());
        }
    }

    /** @return the total number of word occurrences counted */
    int getWordsNumber() { return this.words_total_number; }

    /** @return the map from each distinct word to its number of occurrences */
    HashMap<String,Integer> getWordsHashmap() { return this.words_ht; }
}
/***************************************************************************************/
//author: Dragos Sburlan
package nlp;
import java.util.*;
import java.io.*;
/**
 * Entry point: counts the words of the input file, writes one line per word
 * (word, count, weight) to a temporary file, bulk-loads that file into the
 * "words" table and prints one stored record as a check.
 */
public class NLP {
    /**
     * Runs the whole pipeline. File paths, database address and credentials are
     * hard-coded.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        WordsCounter wc = new WordsCounter("c:\\NLP\\input_file.txt");
        HashMap<String,Integer> words_hm = wc.getWordsHashmap();
        int total_words = wc.getWordsNumber();

        // For speed, the records are not inserted one by one: the whole map is
        // written to a file which is then loaded into the table in one statement.
        StringBuilder sb = new StringBuilder();
        for (Map.Entry<String,Integer> entry : words_hm.entrySet()) {
            int nr = entry.getValue();
            // weight of the word: natural logarithm of its relative frequency
            double frequency = Math.log(((double) nr) / total_words);
            // The separator is a bare ',' so that it is exactly the field
            // terminator declared in the LOAD DATA statement below.
            sb.append(entry.getKey()).append(',').append(nr).append(',').append(frequency).append("\r\n");
        }

        // try-with-resources closes the writer even when the write fails
        try (BufferedWriter out = new BufferedWriter(new FileWriter("c:\\NLP\\output_tmp.txt"))) {
            out.write(sb.toString());
        } catch (IOException exc) {
            System.err.println("Error: " + exc.getMessage());
            // Without a freshly written file there is nothing valid to load.
            return;
        }

        // NOTE(review): the file is written to c:\NLP\ but loaded from '/NLP/';
        // presumably these resolve to the same location on the target machine — confirm.
        String querydb = "LOAD DATA LOCAL INFILE '/NLP/output_tmp.txt' INTO TABLE words FIELDS TERMINATED BY ',' LINES TERMINATED BY '\r\n'";

        // connecting to DB and loading the file
        String address = "jdbc:mysql://localhost/NLP";
        String user = "root";
        String pass = "root";
        DBAccessController dba = new DBAccessController(address, user, pass);
        try {
            dba.runSQL(querydb);

            // getting a record for testing
            String request = "select * from words where string='the'";
            ArrayList result = dba.runSQL(request);
            // runSQL returns null on error and an empty list when no row matches
            if (result == null || result.isEmpty()) {
                System.err.println("No record found for the test word 'the'");
            } else {
                ArrayList first_record = (ArrayList) result.get(0);
                System.out.println(first_record.get(0));
            }
        } finally {
            // release the connection whatever happened above
            dba.stop();
        }
    }
}