25 July 2009

Lucene and MySQL

I read about the full text search provided by MySQL over the web and had to look for a better option for indexing my data which grows everyday. Sphinx looked a better choice but the Java api is not fully mature yet so I had to choose Lucene which takes little more time than Sphinx in displaying the search results but is far better than MySQL full text search, comparison is given here. In some cases Lucene outperforms other two. Other advantage of Lucene is its sub-projects like: Solr, Compass, Nutch etc.

Here is the sample code for indexing the MySQL data: (Please find the corrected code here for the method getSqlData())

 
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.TopDocCollector;
import java.sql.*;
import java.io.IOException;

/**
*
* @author viksin
*/
public class LuceneImp {

public void getSqlData() {
   String IndexStoreDir = "F:/LuceneIndex";
   ResultSet rs;
   StandardAnalyzer analyzer = new StandardAnalyzer();
   IndexWriter writer = null;

   try {
       String connUrl = "jdbc:mysql://localhost:3306/SQLDb"; 
        // put your mysql database name in place of SQLDb
       Connection con = DriverManager.getConnection(connUrl, "root", "SQLPassword");
       // put your mysql password in place of SQLPassword
       Statement stmt = con.createStatement();
       Class.forName("com.mysql.jdbc.Driver");
       stmt.executeQuery("select * from table");
       writer = new IndexWriter(IndexStoreDir, analyzer, true, IndexWriter.MaxFieldLength.UNLIMITED);
       rs = stmt.getResultSet();
       while (rs.next()) {
      
           addDoc(writer, "title", rs.getString("title"));
           addDoc(writer, "content", rs.getString("content"));

       }
       writer.close();
       rs.close();
       stmt.close();
   } catch (Exception ex) {
   }
}

private static void addDoc(IndexWriter w, String field, String value) throws IOException {
   Document doc = new Document();
   doc.add(new Field(field, value, Field.Store.YES, Field.Index.ANALYZED));
   w.addDocument(doc);
}
}

If you want to add a new sql row to an existing index replace true with false in the following line and run the above program.
 writer = new IndexWriter(IndexStoreDir, analyzer, true, IndexWriter.MaxFieldLength.UNLIMITED);

You can put a column in the sql table (a flag) for checking and updating the indexing status of the row. In that case the you'll have to add following lines in the above code:
int indexStatus = 0;            
while (rs.next()) {
                indexStatus = rs.getInt("isindexed");
                if (indexStatus != 1) {
                    addDoc(writer, "title", rs.getString("title"));
                    addDoc(writer, "content", rs.getString("content"));
                    addDoc(writer, "link", rs.getString("link"));
                    addDoc(writer, "source", rs.getString("source"));
                    updateIndexStatus(rs.getString("id"));
                }

updateIndexStatus method will look like this:
            
    private static void updateIndexStatus(String Id) {
        try {
           String connUrl = "jdbc:mysql://localhost:3306/SQLDb"; 
           // put your mysql database name in place of SQLDb
           Connection con = DriverManager.getConnection(connUrl, "root", "SQLPassword");
           // put your mysql password in place of SQLPassword
            Statement stmt = con.createStatement();
            Class.forName("com.mysql.jdbc.Driver");
            stmt.executeQuery("update table set isindexed='1' where id="+Id+"");
            stmt.close();
        } catch (Exception ex) {
        }
    }

For searching your index this method can help:
 private static void searchIndex() {
      StandardAnalyzer analyzer = new StandardAnalyzer();
      String qu = "Java and Linux"; // put your keyword here
      String IndexStoreDir = "F:/LuceneIndex";
      try {
          Query q = new QueryParser("title", analyzer).parse(qu);
          int hitspp = 100; //hits per page
          IndexSearcher searcher = new IndexSearcher(IndexStoreDir);
          TopDocCollector collector = new TopDocCollector(hitspp);
          searcher.search(q, collector);
          ScoreDoc[] hits = collector.topDocs().scoreDocs;
          System.out.println("Found " + hits.length + " hits.");
          for (int i = 0; i < hits.length; ++i) {
                int docId = hits[i].doc;
                Document d = searcher.doc(docId);
                System.out.println((i + 1) + ". " + d.get("title"));
            }
            searcher.close();
        } catch (Exception ex1) {
        }
    }

Or you can use Luke for browsing index created by Lucene.