java SAP Hana Spark : Lucene Tester

There is no need to explain why someone would need Lucene, and there are
already plenty of samples on the net. I am simply posting our own sample here
in case anyone who comes across this page finds it useful.
import java.io.File ;
import java.io.Reader ;
import java.io.Serializable ;
import java.io.StringReader ;
import java.util.HashMap ;
import java.util.List ;

import org.apache.lucene.analysis.Analyzer ;
import org.apache.lucene.analysis.standard.StandardAnalyzer ;
import org.apache.lucene.document.Document ;
import org.apache.lucene.document.Field ;
import org.apache.lucene.index.CorruptIndexException ;
import org.apache.lucene.index.IndexReader ;
import org.apache.lucene.index.IndexWriter ;
import org.apache.lucene.index.IndexWriter.MaxFieldLength ;
import org.apache.lucene.index.Term ;
import org.apache.lucene.queryParser.QueryParser ;
import org.apache.lucene.search.IndexSearcher ;
import org.apache.lucene.search.Query ;
import org.apache.lucene.search.ScoreDoc ;
import org.apache.lucene.search.Searcher ;
import org.apache.lucene.search.TermQuery ;
import org.apache.lucene.search.TopDocs ;
import org.apache.lucene.search.similar.MoreLikeThis ;
import org.apache.lucene.search.spell.Dictionary ;
import org.apache.lucene.search.spell.LuceneDictionary ;
import org.apache.lucene.search.spell.SpellChecker ;
import org.apache.lucene.store.Directory ;
import org.apache.lucene.store.FSDirectory ;
import org.apache.lucene.util.Version ;

public class YLuceneTester
{


    private final String indexDir = "D:\\indexDir" ;

    private final String spellDirPath = "D:\\spellDir" ;

    /**
     * create index
     */
    public boolean createIndex( ) throws Exception
    {
        //        if( true == ifIndexExist( ) )
        //        {
        //            return true ;
        //        }
        //        File dir = new File(dataDir);
        //        if(!dir.exists()){
        //            return false;
        //        }

        //File[] htmls = dir.listFiles();

        Directory fsDirectory = FSDirectory.open( new File( this.indexDir ) ) ;
        Analyzer analyzer = new StandardAnalyzer( Version.LUCENE_33 ) ;
        IndexWriter indexWriter = new IndexWriter( fsDirectory, analyzer, true, MaxFieldLength.UNLIMITED ) ;

        addDocument( indexWriter ) ;

        indexWriter.optimize( ) ;
        indexWriter.close( ) ;

        IndexReader indexReader = null ;
        try
        {
            indexReader = IndexReader.open( fsDirectory ) ;
            Dictionary dictionary = new LuceneDictionary( indexReader, "trans" ) ;
            FSDirectory spellDir = FSDirectory.open( new File( this.spellDirPath ) ) ;
            SpellChecker spellChecker = new SpellChecker( spellDir ) ;
            spellChecker.indexDictionary( dictionary ) ;
            spellChecker.close( );
        }
        finally
        {
            if( indexReader != null )
            {
                indexReader.close( ) ;
            }
        }
        return true ;

    }

    /**
     * Add one document to the Lucene index
     * @throws Exception 
     * @throws CorruptIndexException 
     */
    public void addDocumentDB( IndexWriter indexWriter ) throws CorruptIndexException, Exception
    {
        YOrganization org = YOrganization.getTopLevelOrganization( "TXG100" ) ;

        String hql = " Select y.trans,y.id FROM YEntityTranslation y where y.organization.id =  " + org.getId( ) ;

        HashMap parameters = new HashMap( ) ;

        List existingCatalogData = ( List )HibernateUtils.execHQL( hql, parameters, 0, 1000 ) ;          for( Object[ ] datas : existingCatalogData )         {             String trans = ( String )datas[ 0 ] ;             if( YClientUtils.isBlankTrim( trans ) )                 continue ;              System.err.println( trans ) ;             Document document = new Document( ) ;             //document.add( new Field( "path", path, Field.Store.YES, Field.Index.NO ) ) ;             document.add( new Field( "trans", trans, Field.Store.YES, Field.Index.ANALYZED ) ) ;              indexWriter.addDocument( document ) ;         }      }          public void addDocument( IndexWriter indexWriter ) throws CorruptIndexException, Exception     {         String[] items = new String[]{"African lion","African wild cat","African wild dog","dog","cat","lion"};                  for( String item : items )         {              Document document = new Document( ) ;             document.add( new Field( "trans", item, Field.Store.YES, Field.Index.ANALYZED ) ) ;              indexWriter.addDocument( document ) ;         }      }      public Query suggest( String queryString ,int distance) throws Exception     {         try         {             Directory fsDirectory = FSDirectory.open( new File( this.spellDirPath ) ) ;             SpellChecker spellChecker = new SpellChecker( fsDirectory ) ;             if( spellChecker.exist( queryString ) )             {                 return null ;             }             String[ ] similarWords = spellChecker.suggestSimilar( queryString, distance ) ;             if( similarWords.length == 0 )             {                 return null ;             }              System.err.println( " Term = " + queryString + " Suggestions :" ) ;             for( String similarWord : similarWords )             {                 System.err.println( " ) " + similarWord ) ;             }              return new TermQuery( new Term( "trans", similarWords[ 0 ] ) ) ;     
    }         catch( Exception e )         {             throw new Exception( e.getMessage( ) ) ;         }     }      public void searchIndex( String[ ] queryStrings ) throws Exception     {         Searcher searcher = new IndexSearcher( FSDirectory.open( new File( this.indexDir ) ) ) ;         QueryParser parser = new QueryParser( Version.LUCENE_CURRENT, "trans", new StandardAnalyzer( Version.LUCENE_CURRENT ) ) ;         for( String queryString : queryStrings )         {             System.out.println( "nsearching for: " + queryString ) ;             Query query = parser.parse( queryString ) ;             TopDocs results = searcher.search( query, 10 ) ;             System.out.println( "total hits: " + results.totalHits ) ;             ScoreDoc[ ] hits = results.scoreDocs ;             for( ScoreDoc hit : hits )             {                 Document doc = searcher.doc( hit.doc ) ;                 System.out.printf( "%5.3f %sn \n", hit.score, doc.get( "trans" ) ) ;             }         }         searcher.close( ) ;     }       /**      * judge if the index exists already      */     public boolean ifIndexExist( )     {         File directory = new File( this.indexDir ) ;         if( 0 < directory.listFiles( ).length )
        {
            return true ;
        }
        else
        {
            return false ;
        }
    }

    public String getIndexDir( )
    {
        return this.indexDir ;
    }


    public Query parse( String queryString ) throws Exception
    {
        QueryParser queryParser = new QueryParser( Version.LUCENE_CURRENT, "trans", new StandardAnalyzer( Version.LUCENE_CURRENT ) ) ;
        queryParser.setDefaultOperator( QueryParser.AND_OPERATOR ) ;
        return queryParser.parse( queryString ) ;
    }

    public void search( String queryString ,int distance ) throws Exception
    {
        long startTime = System.currentTimeMillis( ) ;
        IndexSearcher is = null ;
        FSDirectory spellDir = FSDirectory.open( new File( this.spellDirPath ) ) ;
        Directory fsDirectory = FSDirectory.open( new File( this.indexDir ) ) ;

        int minimumHits = 100 ;
        int minimumScore = 5 ;

        try
        {
            is = new IndexSearcher( fsDirectory ) ;
            Query query = parse( queryString ) ;

            TopDocs tdocs = is.search( query, 100 ) ;

            //Hits hits = is.search( query ) ;

            //            for( ScoreDoc sdoc :  tdocs.scoreDocs )
            //            {
            //                sdoc.
            //            }

            String suggestedQueryString = null ;
            if( tdocs.totalHits < minimumHits || tdocs.getMaxScore( ) < minimumScore )
            {
                Query didYouMean = suggest( queryString ,distance) ;
                if( didYouMean != null )
                {
                    suggestedQueryString = didYouMean.toString( "trans" ) ;
                }
            }

            long endTime = System.currentTimeMillis( ) ;

            //return new SearchResult( extractHits( hits ), hits.length( ), endTime - startTime, queryString, suggestedQueryString ) ;
        }
        finally
        {
            if( is != null )
            {
                is.close( ) ;
            }
        }
    }

    public void moreLikeThis( String text ) throws Exception
    {
        Directory fsDirectory = FSDirectory.open( new File( this.indexDir ) ) ;

        IndexReader indexReader = IndexReader.open( fsDirectory ) ;

        //        FuzzyLikeThisQuer flt = new FuzzyLikeThisQuery( 50, new StandardAnalyzer( ) ) ;
        //        flt.addTerms( "product critical update", "title", 0.75f, FuzzyQuery.defaultPrefixLength ) ;
        //        BooleanQuery q = ( BooleanQuery )flt.rewrite( r ) ;
        //        int minNumClauseMatches = Math.round( q.clauses( ).size( ) * 0.5f ) ;
        //        q.setMinimumNumberShouldMatch( minNumClauseMatches ) ;

        IndexSearcher is = new IndexSearcher( FSDirectory.open( new File( this.indexDir ) ) ) ;

        MoreLikeThis mlt = new MoreLikeThis( indexReader ) ;
        mlt.setFieldNames( new String[ ] { "trans" } ) ;

        mlt.setMinWordLen( 2 ) ;
        mlt.setBoost( true ) ;

        Reader reader = new StringReader( text ) ;

        //Create the query that we can then use to search the index
        Query query = mlt.like( reader ) ;

        //Search the index using the query and get the top 5 results
        TopDocs topDocs = is.search( query, 5 ) ;

        //Create an array to hold the quotes we are going to
        //pass back to the client

        for( ScoreDoc scoreDoc : topDocs.scoreDocs )
        {
            //This retrieves the actual Document from the index using
            //the document number. (scoreDoc.doc is an int that is the

            System.err.print( "--" + scoreDoc.toString( ) ) ;
        }
        
        is.close( );

    }
    
    public static void init()
    {
        YLuceneTester luceneTester = new YLuceneTester( ) ;
        try
        {
            luceneTester.createIndex( ) ;

            //luceneTester.searchIndex( new String[ ] { "Cleaner" } ) ;
            //TermQuery q = ( TermQuery )luceneTester.suggest( "Claner" ) ;
            //q.extractTerms( terms )

            //luceneTester.moreLikeThis( "Clean" ) ;

            //            luceneTester.search( "Cleaner" ) ;
            //            luceneTester.search( "Cordless " ) ;
            //
            //            luceneTester.suggest( "Cleane" ) ;
            //            luceneTester.suggest( "Clean" ) ;
            //            luceneTester.suggest( "Clnr" ) ;

        }
        catch( Exception e )
        {
            e.printStackTrace( ) ;
        }
    }
    
    public static void tests()
    {
        YLuceneTester luceneTester = new YLuceneTester( ) ;
        try
        {

//            luceneTester.searchIndex( new String[ ] { "Afri" } ) ;
//            luceneTester.searchIndex( new String[ ] { "African" } ) ;
//            luceneTester.searchIndex( new String[ ] { "Africax" } ) ;
//            TermQuery q = ( TermQuery )luceneTester.suggest( "Claner" ) ;
//            q.extractTerms( terms )

            //luceneTester.moreLikeThis( "dog" ) ;

                        luceneTester.search( "Afrieen" ,1 ) ;
                        luceneTester.search( "Afrieen" ,1 ) ;
                        luceneTester.search( "Afrieen" ,1 ) ;
            //            luceneTester.search( "Cordless " ) ;
            //
                        luceneTester.suggest( "Afrieen",2 ) ;
                        luceneTester.suggest( "lion" ,2) ;
            //            luceneTester.suggest( "Clnr" ) ;

        }
        catch( Exception e )
        {
            e.printStackTrace( ) ;
        }
    }

    public static void main( String[ ] args )
    {
        //HibernateUtils._configFileName = "hibernate.hqltest.xml" ;
        
        //init( );
        
        tests( );



    }
}
java SAP Hana Spark

Wednesday, August 19, 2015

Lucene Tester

2 comments:

Blog Archive

Labels