July 26, 2014

How to determine Word Frequency in Java

?: Given a text file of arbitrary length, rank the words from most to least common in a file of arbitrary size.

A: Divide the file into individual words, put them into an RDBMS and use that to count. In the code below, I've chosen to use the embedded flavour of H2. Why H2? From its page:

The main features of H2 are:

Very fast, open source, JDBC API
Embedded and server modes; in-memory databases
Browser based Console application
Small footprint: around 1.5 MB jar file size
I've chosen Java to do this task. And I decided to not use any NLP library to handle word-segmentation out of a concern for disk space. I do believe h2 will write delimited data natively as well. Maybe for future improvement. But for now, the code reads:
 public static void main (String[] args) {
  try {
   Class.forName("org.h2.Driver").newInstance();
  } catch (ClassNotFoundException e) {
   e.printStackTrace(System.err);
   throw new RuntimeException(e.getMessage());
  } catch (InstantiationException e) {
   e.printStackTrace(System.err);
   throw new RuntimeException(e.getMessage());
  } catch (IllegalAccessException e) {
   e.printStackTrace(System.err);
   throw new RuntimeException(e.getMessage());
  }
  Long MAX_TRANSACTION_LENGTH = null;
  try {
   MAX_TRANSACTION_LENGTH = Long.parseLong(System.getProperty("max.transaction.length"));
  } catch (NumberFormatException e) {
   MAX_TRANSACTION_LENGTH = 40l;
  }
  Connection conn = null;
  try {
   conn = DriverManager.getConnection("jdbc:h2:alation;LOG=0;CACHE_SIZE=65536;LOCK_MODE=0;UNDO_LOG=0", "sa","");
   conn.setAutoCommit(false);
  } catch (SQLException e) {
   e.printStackTrace(System.err);
   throw new RuntimeException(e.getMessage());
  }
  String stringToExamine = "";
  try {
   conn.createStatement().execute("DROP TABLE IF EXISTS indexing;");
   conn.createStatement().execute("CREATE TABLE indexing (word VARCHAR UNIQUE, frequency INT)");
   conn.commit();
  } catch (SQLException e) {
   System.err.println("Creation failed -- "+e.getMessage());
   e.printStackTrace(System.err);
  }
  try {
   PreparedStatement insertStatement = conn.prepareStatement("INSERT INTO indexing (word, frequency) VALUES (LOWER(?),1)");
   int transactionLength = 0;
   Long insertStart = System.currentTimeMillis();
   BufferedReader reader = new BufferedReader(new FileReader(args[0]));
   String args_ = null;
   String line = null;
   while ((line = reader.readLine()) != null) {
    for (String word : line.split(" ")) {
     insertStatement.setString(1, word.replaceAll("\\p{Punct}",""));
     if (word.matches("^\\s+$")) {
      continue;
     }
     try {
      int inserted = insertStatement.executeUpdate();
     } catch (JdbcSQLException j) {
      PreparedStatement prepped = conn.prepareStatement("UPDATE indexing SET frequency = frequency + 1 WHERE word = ?", ResultSet.TYPE_SCROLL_SENSITIVE, ResultSet.CONCUR_UPDATABLE);
      prepped.setString(1, word);
      int updated = prepped.executeUpdate();
     }
     conn.commit();
    }
   }
   reader.close();
  } catch  (SQLException e) {
   System.err.println(e.getMessage());
   e.printStackTrace(System.err);
  } catch (IOException e) {
   System.err.println(e.getMessage());
   e.printStackTrace(System.err);
  } 
   
  try {
   ResultSet words = conn.createStatement(ResultSet.TYPE_FORWARD_ONLY, ResultSet.CONCUR_READ_ONLY).executeQuery("SELECT word||'\t'||frequency as res FROM indexing ORDER BY frequency DESC, word");
   words.next();
    
   PrintStream out = null;
   try {
    out = new PrintStream(args[1]);
   } catch (Exception e) {
    out = new PrintStream(System.out);
   }
   Long start = System.currentTimeMillis();
   while (words.next()) {
    out.println(words.getString("res"));
   } 
   conn.close();
   System.err.println("\n\nProcessed in "+new Long(System.currentTimeMillis() - start)+" miliseconds.");
  } catch (SQLException e) {
   System.err.println(e.getMessage());
   e.printStackTrace(System.err);
  }
 }

No comments:

Post a Comment