Sunday, December 11, 2011

Apache Lucene POC with code and explanation


Lucene is an open source, highly scalable text search-engine library available from the Apache Software Foundation. You can use Lucene in commercial and open source applications. Lucene's powerful APIs focus mainly on text indexing and searching. It can be used to build search capabilities for applications such as e-mail clients, mailing lists, Web search, database search, etc. Web sites like Wikipedia, TheServerSide, jGuru, and LinkedIn have been powered by Lucene.
Lucene has many features. It:
  • Has powerful, accurate, and efficient search algorithms.
  • Calculates a score for each document that matches a given query and returns the most relevant documents ranked by the scores.
  • Supports many powerful query types, such as PhraseQuery, WildcardQuery, RangeQuery, FuzzyQuery, BooleanQuery, and more.
  • Supports parsing of human-entered rich query expressions.
  • Allows users to extend the searching behavior using custom sorting, filtering, and query expression parsing.
  • Uses a file-based locking mechanism to prevent concurrent index modifications.
  • Allows searching and indexing simultaneously.

LuceneIndexWriter Class
package pocUsingLucene;

import java.io.File;
import java.io.IOException;

import org.apache.log4j.Logger;
import org.apache.lucene.analysis.LimitTokenCountAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.LogMergePolicy;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.index.SerialMergeScheduler;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.store.NIOFSDirectory;
import org.apache.lucene.util.Version;

/**
 * @author glakshmanan
 *
 */
/**
 * Proof-of-concept writer for a Lucene (3.x) index.
 *
 * <p>Builds a small on-disk index at {@link #INDEX_DIR}: it opens an
 * {@link IndexWriter}, wipes any previous documents, indexes one sample
 * document with a handful of differently-configured fields, then optimizes,
 * commits and closes the writer.
 */
public class LuceneIndexWriter {

    // Class-level log4j logger.
    public static final Logger log = Logger.getLogger(LuceneIndexWriter.class);

    // Filesystem directory where the index segments are written.
    private static final String INDEX_DIR = "/home/glakshmanan/index";

    // Default constructor; the class keeps no instance state.
    public LuceneIndexWriter() {
    }

    /**
     * Entry point: creates the writer, clears the existing index, adds the
     * sample document, and always releases the writer (and its write lock).
     *
     * @param args unused
     * @throws CorruptIndexException if the existing index is corrupt
     * @throws LockObtainFailedException if another writer holds the index lock
     * @throws IOException on low-level I/O failure
     * @throws ParseException declared for parity with the searcher POC; not
     *         actually thrown by this class
     */
    public static void main(String[] args) throws CorruptIndexException,
            LockObtainFailedException, IOException, ParseException {

        LuceneIndexWriter luceneIndexWriter = new LuceneIndexWriter();

        // Create IndexWriter object using config
        IndexWriter iw = luceneIndexWriter.createIndexWriter();
        try {
            // Start from a clean index so repeated runs do not pile up
            // duplicate documents
            luceneIndexWriter.deleteAllDocuments(iw);

            // Create the sample document and index it
            luceneIndexWriter.createDocument(iw);
        } finally {
            // Optimize, commit and close even on failure, so the index
            // write lock is always released
            luceneIndexWriter.closeIndexWriter(iw);
        }

        // Delete the Document
        // luceneIndexWriter.deleteDocument();

        log.info("IndexWriter wrote the data successfully");
    }

    /**
     * Removes every document from the index and commits the deletion.
     *
     * @param iw open writer to clear
     * @throws IOException on I/O failure
     */
    private void deleteAllDocuments(IndexWriter iw) throws IOException {
        // Delete existing docs, segments and terms before we add new data
        iw.deleteAll();
        iw.commit();
    }

    /**
     * Deletes documents from the index directory by {@link Term}.
     *
     * <p>NOTE(review): {@code new Term("name")} carries an empty term text,
     * so this matches only documents that indexed an empty token under
     * "name" — in practice it likely deletes nothing. Pass the term text
     * (e.g. {@code new Term("name", "gubendran")}) to delete a real document.
     *
     * @throws IOException on I/O failure
     */
    @SuppressWarnings("unused")
    private void deleteDocument() throws IOException {
        Directory dir = NIOFSDirectory.open(new File(INDEX_DIR));
        // readOnly=false: in Lucene 3.x, IndexReader.open(dir) returns a
        // read-only reader and deleteDocuments would throw
        // UnsupportedOperationException.
        IndexReader ir = IndexReader.open(dir, false);
        try {
            ir.deleteDocuments(new Term("name"));
        } finally {
            // Closing flushes pending deletions and releases the reader
            ir.close();
        }
    }

    /**
     * Optimizes, commits and closes the writer, releasing the write lock.
     *
     * @param iw writer to close
     * @throws IOException on I/O failure
     */
    private void closeIndexWriter(IndexWriter iw) throws IOException {
        // Optimize, commit and close the IndexWriter
        iw.optimize();
        iw.commit();
        iw.close();
    }

    /**
     * Builds one sample {@link Document} demonstrating the main field
     * configurations (stored-only, stored+analyzed, indexed-only) and adds
     * it to the index.
     *
     * @param iw open writer to add the document to
     * @throws CorruptIndexException if the index is corrupt
     * @throws IOException on I/O failure
     */
    private void createDocument(IndexWriter iw) throws CorruptIndexException,
            IOException {
        // Creating document to add using indexWriter
        Document doc = new Document();

        // Value is just stored, not indexed (retrievable but not searchable)
        Field idField = new Field("id", "1", Field.Store.YES, Field.Index.NO);
        // Value is stored and indexed: analyzed and tokenized
        Field nameField = new Field("name", "Gubendran", Field.Store.YES,
                Field.Index.ANALYZED);
        // Field is indexed as a single token but not stored
        Field addressField = new Field("address", "Jersey city",
                Field.Store.NO, Field.Index.NOT_ANALYZED);

        // Combined full text kept in one "content" field so a single query
        // can match across name and address. A space separator is required:
        // without it the tokens fuse into "GubendranJersey".
        String searchFullText = "Gubendran" + " " + "Jersey city";
        Field searchFullTextField = new Field("content", searchFullText,
                Field.Store.YES, Field.Index.ANALYZED);

        // Add fields into document
        doc.add(idField);
        doc.add(nameField);
        doc.add(addressField);
        doc.add(searchFullTextField);

        // Adding NumericField Example
        /*
         * NumericField numericField = new NumericField("title",
         * Field.Store.YES, true);
         * numericField.setIntValue(Integer.parseInt(value));
         * doc.add(numericField);
         */

        iw.addDocument(doc);
    }

    /**
     * Creates the {@link IndexWriter} over {@link #INDEX_DIR} with an
     * explicit analyzer, merge policy, merge scheduler and RAM buffer.
     *
     * @return a ready-to-use writer; the caller is responsible for closing it
     * @throws CorruptIndexException if the existing index is corrupt
     * @throws LockObtainFailedException if another writer holds the lock
     * @throws IOException on I/O failure
     */
    private IndexWriter createIndexWriter() throws CorruptIndexException,
            LockObtainFailedException, IOException {
        // 1. Open the index directory. On Linux, NIOFSDirectory performs
        // better than the default implementation.
        File file = new File(INDEX_DIR);
        Directory index = NIOFSDirectory.open(file);

        // 2. Specify the analyzer for tokenizing text. The same analyzer
        // must be used for indexing and searching; StandardAnalyzer suits
        // English text.
        StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_31);

        // Wrap the analyzer to cap token counts per field — the 3.x
        // replacement for IndexWriter.MaxFieldLength.
        LimitTokenCountAnalyzer limitTokenCountAnalyzer = new LimitTokenCountAnalyzer(
                analyzer, Integer.MAX_VALUE);

        // Documents are stored in segments; tuning segment size and merge
        // behavior trades indexing speed against search speed.
        final int segmentSize = 10;
        int mergeFactorSize = 100;
        int RAMBufferSizeMB = 32;
        int maxBufferedDocs = 500;
        LogMergePolicy logMergePolicy = new LogMergePolicy() {

            @Override
            protected long size(SegmentInfo info) throws IOException {
                // Treat every segment as a fixed size so merges level evenly
                return segmentSize;
            }
        };
        // How many documents are stored in the original segment index and
        // how often segment indexes are merged together on disk
        logMergePolicy.setMergeFactor(mergeFactorSize);

        logMergePolicy.setUseCompoundFile(true);
        // Maximum number of documents per segment index. The default is
        // Integer.MAX_VALUE; large values suit batched indexing and give
        // speedier searches.
        logMergePolicy.setMaxMergeDocs(Integer.MAX_VALUE);

        IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_31,
                limitTokenCountAnalyzer);
        // Attach the tuned policy — without this call the LogMergePolicy
        // built above is never used by the writer.
        config.setMergePolicy(logMergePolicy);
        config.setRAMBufferSizeMB(RAMBufferSizeMB);
        config.setMergeScheduler(new SerialMergeScheduler());
        config.setMaxBufferedDocs(maxBufferedDocs);

        IndexWriter iw = new IndexWriter(index, config);
        return iw;
    }
}

Lucene IndexSearcher

package pocUsingLucene;

import java.io.File;
import java.io.IOException;

import org.apache.log4j.Logger;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.NIOFSDirectory;
import org.apache.lucene.util.Version;

/**
 * @author glakshmanan
 *
 */
/**
 * Proof-of-concept searcher for the index produced by the writer POC.
 *
 * <p>Opens an {@link IndexSearcher} over {@link #INDEX_DIR}, parses the
 * first command-line argument into a {@link Query} against the "name"
 * field, collects the top hits with a {@link TopScoreDocCollector}, and
 * logs the matching documents' names.
 */
public class LuceneIndexSearcher {

    // Class-level log4j logger.
    public static final Logger log = Logger
            .getLogger(LuceneIndexSearcher.class);

    // Filesystem directory holding the index created by the writer POC.
    private static final String INDEX_DIR = "/home/glakshmanan/index";

    // Default constructor; the class keeps no instance state.
    public LuceneIndexSearcher() {
        super();
    }

    /**
     * Entry point: searches the index for the given text and logs the hits.
     *
     * @param args optional; args[0] is the search text (defaults to "dummies")
     * @throws IOException on low-level I/O failure
     * @throws ParseException if the search text cannot be parsed into a query
     */
    public static void main(String[] args) throws IOException, ParseException {

        // Get the argument for Search
        String searchText = args.length > 0 ? args[0] : "dummies";

        LuceneIndexSearcher luceneIndexSearcher = new LuceneIndexSearcher();
        // Create indexSearcher Object for search
        IndexSearcher searcher = luceneIndexSearcher.createIndexSearcher();
        try {
            // Build the Query from the user-supplied text
            Query queryParser = luceneIndexSearcher.createQueryParser(searchText);

            // Collect the top-scoring documents. hitsPerPage bounds the
            // result set, which is what makes paging and filtering cheap.
            int hitsPerPage = 10;
            TopScoreDocCollector collector = TopScoreDocCollector.create(
                    hitsPerPage, true);
            searcher.search(queryParser, collector);

            ScoreDoc[] scoreDocs = collector.topDocs().scoreDocs;

            // Display the search values using top scored docs
            luceneIndexSearcher.displaySearchResult(searcher, scoreDocs);
        } finally {
            // Release the searcher even if the search fails; documents can
            // no longer be accessed after this point.
            searcher.close();
        }
    }

    /**
     * Logs the name field of each matching document.
     *
     * @param searcher searcher used to load stored documents by id
     * @param scoreDocs the ranked hits to display
     * @throws CorruptIndexException if the index is corrupt
     * @throws IOException on I/O failure
     */
    private void displaySearchResult(IndexSearcher searcher,
            ScoreDoc[] scoreDocs) throws CorruptIndexException, IOException {
        log.info("List of docs found : " + scoreDocs.length);
        for (ScoreDoc scoreDoc : scoreDocs) {
            Document doc = searcher.doc(scoreDoc.doc);
            log.info("Name : " + doc.get("name"));
        }
    }

    /**
     * Parses the search text into a {@link Query} against the "name" field.
     *
     * <p>Uses the same StandardAnalyzer (and Lucene version) as the index
     * writer — index-time and search-time analysis must match.
     *
     * @param searchText user-supplied query expression
     * @return the parsed query
     * @throws ParseException if the expression is malformed
     */
    private Query createQueryParser(String searchText) throws ParseException {

        Query queryParser = new QueryParser(Version.LUCENE_31, "name",
                new StandardAnalyzer(Version.LUCENE_31)).parse(searchText);

        // NumericRangeQuery search for numeric search
        // Query queryParser = NumericRangeQuery.newIntRange("title", 4, 40,
        // 500, true, true);
        return queryParser;
    }

    /**
     * Opens a read-only {@link IndexSearcher} over {@link #INDEX_DIR}.
     *
     * @return the searcher; the caller is responsible for closing it
     * @throws IOException on I/O failure
     */
    private IndexSearcher createIndexSearcher() throws IOException {
        // 1. Open the index directory
        File file = new File(INDEX_DIR);
        Directory index = NIOFSDirectory.open(file);

        // readOnly=true: this POC never modifies the index while searching
        IndexSearcher indexSearcher = new IndexSearcher(index, true);

        return indexSearcher;
    }

}



No comments :

// Below script tag for SyntaxHighLighter