/**
 * Use of the WebDLPIndexer and related source code is subject to the
 * terms of the following license:
 *
 * Copyright (c) 2013 Carnegie Mellon University. All Rights Reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following acknowledgments and disclaimers.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. All advertising materials for third-party software mentioning features or
 * use of this software must display the following disclaimer:
 *
 * “Neither Carnegie Mellon University nor its Software Engineering Institute
 * have reviewed or endorsed this software”
 *
 * 4. The names “Carnegie Mellon University,” "CERT” and/or “Software
 * Engineering Institute" shall not be used to endorse or promote products
 * derived from this software without prior written permission. For written
 * permission, please contact permission@sei.cmu.edu.
 *
 * 5. Products derived from this software may not be called "CERT" nor may
 * "CERT" appear in their names without prior written permission of
 * permission@sei.cmu.edu.
 *
 * 6. Redistributions of any form whatsoever must retain the following
 * acknowledgment:
 *
 * "This product includes software developed by CERT with funding and support
 * from the US Government under Contract No. FA8721-05-C-0003. The U.S.
 * Government's rights to use, modify, reproduce, release, perform, display, or
 * disclose this material are restricted by the Rights in Technical
 * Data-Noncommercial Items clauses (DFAR 252-227.7013 and DFAR 252-227.7013
 * Alternate I contained in the foregoing identified contract.
 *
 * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
 * CARNEGIE MELLON UNIVERSITY MAKES NO WARRANTIES OF ANY KIND, EITHER EXPRESS OR
 * IMPLIED, AS TO ANY MATTER, AND ALL SUCH WARRANTIES, INCLUDING WARRANTIES OF
 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ARE EXPRESSLY
 * DISCLAIMED. WITHOUT LIMITING THE GENERALITY OF THE FOREGOING, CARNEGIE MELLON
 * UNIVERSITY DOES NOT MAKE ANY WARRANTY OF ANY KIND RELATING TO EXCLUSIVITY,
 * INFORMATIONAL CONTENT, ERROR-FREE OPERATION, RESULTS TO BE OBTAINED FROM USE,
 * FREEDOM FROM PATENT, TRADEMARK AND COPYRIGHT INFRINGEMENT AND/OR FREEDOM FROM
 * THEFT OF TRADE SECRETS.”
 *
 */
package webdlpindexer.indexer;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import org.apache.commons.io.FileUtils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.*;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.util.Version;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import webdlpindexer.Logger;

/**
 * This class handles all direct manipulations of the Apache Lucene index.  Parts of this code were
 * adapted from Sujit Pal http://sujitpal.blogspot.com/2011/10/computing-document-similarity-using.html
 * 
 * @author Todd Lewellen
 */
public class Indexer {

    /**
     * The String identifier for the Apache Lucene index field for storing each
     * file path
     */
    public final static String FILE_PATH = "path";
    /**
     * The String identifier for the Apache Lucene index field for storing the
     * text contained in each file
     */
    public final static String FILE_CONTENTS = "contents";
    
    private File indexDir;
    private File intellectualPropertyDir;
    private boolean keepDelFiles;
    private boolean debug;  // This is not currently used, but is passed in the constructor in case future code requires its presence
    private Directory index;
    private IndexWriter writer;
    private IndexReader reader;
    private Analyzer analyzer;
    private IndexWriterConfig config;
    private HashMap<String, DocVector> docVectors;

    ////////////////////
    //  Constructors  //
    ////////////////////    
    /**
     * 
     * @param indexDir The directory in which to store the Apache Lucene index files
     */
    public Indexer(File indexDir) {
        this(indexDir, new File("property"));
    }

    /**
     * 
     * @param indexDirPath The directory in which to store the Apache Lucene index files
     */
    public Indexer(String indexDirPath) {
        this(new File(indexDirPath));
    }

    /**
     * 
     * @param indexDirPath The directory in which to store the Apache Lucene index files
     * @param intellectualPropertyDirPath The directory in which the intellectual property files are stored
     */
    public Indexer(String indexDirPath, String intellectualPropertyDirPath) {
        this(new File(indexDirPath), new File(intellectualPropertyDirPath));
    }

    /**
     * 
     * @param indexDir The directory in which to store the Apache Lucene index files
     * @param intellectualPropertyDir The directory in which the intellectual property files are stored
     */
    public Indexer(File indexDir, File intellectualPropertyDir) {
        this(indexDir, intellectualPropertyDir, false, false);
    }

    /**
     * 
     * @param indexDir The directory in which to store the Apache Lucene index files
     * @param intellectualPropertyDir The directory in which the intellectual property files are stored
     * @param keepDelFiles Whether or not files should be removed from the index when deleted from the filesystem
     * @param debug Whether or not debugging is enabled (provides extra logging)
     */
    public Indexer(File indexDir, File intellectualPropertyDir, boolean keepDelFiles, boolean debug) {
        this.indexDir = indexDir;
        this.intellectualPropertyDir = intellectualPropertyDir;
        this.keepDelFiles = keepDelFiles;
        this.debug = debug;

        analyzer = new StandardAnalyzer(Version.LUCENE_36);
    }

    //////////////////////
    // Indexing methods //
    //////////////////////
    /**
     * 
     * @return Returns true if all documents were indexed successfully
     */
    public boolean indexAll() {
        boolean success = true;

        //Delete the current index
        try {
            FileUtils.cleanDirectory(indexDir);
        } catch (IOException ex) {
            Logger.exception(ex);
        }

        prepWriter();
        //Index each document in the intellectualPropertyDir
        for (File f : intellectualPropertyDir.listFiles()) {
            if (!indexDocument(f)) {
                success = false;
            }
        }
        closeWriter();
        return success;
    }

    
    /**
     * Indexes a file into the Lucene index, recursing if the File object is a reference to a directory
     * @param f The file to index
     * @return Whether the file was successfully added to the index
     */
    public boolean indexDocument(File f) {
        boolean result = true;
        if(f.isDirectory()) {
            for(File childFile : f.listFiles()) {
                if(!indexDocument(childFile))
                    result = false;
            }
        }
        else {
            if(!indexFile(f))
                result = false;
        }
        return result;
    }

    
    /**
     * Indexes a document into the Lucene index
     * @param f A reference to the file to add to the index
     * @return Whether or not the file was successfully added to the index
     */
    public boolean indexFile(File f) {
        boolean success = true;
        
        //Determine if the writer has already been prepped & opened
        //If not, store this fact in a boolean, open the writer, and close the writer before the end of the method
        boolean singleUseWriter = (writer == null) ? true : false;
        if (singleUseWriter) {
            prepWriter();
        }

        FileInputStream input = null;

        try {
            //The following code (until the end of the 'try') uses Apache Tika to parse the document
            //and then uses Apache Lucene to add it to the index
            File file = new File(f.getAbsolutePath());
            input = new FileInputStream(file);            
            ContentHandler contenthandler = new BodyContentHandler(-1); //Create the ContentHandler with an infinite write limit 
            Metadata metadata = new Metadata();
            metadata.set(Metadata.RESOURCE_NAME_KEY, f.getAbsolutePath());
            Parser parser = new AutoDetectParser();
            ParseContext context = new ParseContext();
            context.set(Parser.class, parser);
            try {
                parser.parse(input, contenthandler, metadata, context);
                Document doc = new Document();
                doc.add(new Field(FILE_PATH, f.getAbsolutePath(), Field.Store.YES, Field.Index.NOT_ANALYZED));
                doc.add(new Field(FILE_CONTENTS, contenthandler.toString(), Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
                writer.addDocument(doc);
            } catch (SAXException ex) {
                Logger.exception(ex);
                success = false;
            } catch (TikaException ex) {
                Logger.exception(ex);
                success = false;
            }
        } catch (FileNotFoundException ex) {
            success = false;
            Logger.exception(ex);
        } catch (CorruptIndexException ex) {
            success = false;
            Logger.exception(ex);
        } catch (LockObtainFailedException ex) {
            Logger.exception(ex);
            success = false;
        } catch (IOException ex) {
            Logger.exception(ex);
            success = false;
        } finally {
            try {
                //Close the FileInputStream
                if(input != null)
                    input.close();
            } catch (IOException ex) {
                Logger.exception(ex);
            }
            //Close the writer if it hadn't been opened before this method call
            if (singleUseWriter) {
                closeWriter();
            }
        }

        return success;
    }

    /**
     * 
     * @param f A reference to the file to remove from the index
     */
    public void removeDocument(File f) {
        if(!keepDelFiles)
            forceRemoveDocument(f);
    }
    
    /**
     * Will remove the document from the index, regardless if the system was initialized with a parameter to keep deleted documents
     * @param f A reference to the file to remove from the index
     */
    public void forceRemoveDocument(File f){
        prepWriter();
        try {
            writer.deleteDocuments(new Term(FILE_PATH, f.getAbsolutePath()));
        } catch (CorruptIndexException ex) {
            Logger.exception(ex);
        } catch (IOException ex) {
            Logger.exception(ex);
        }
        closeWriter();
    }

    /**
     * 
     * @param f A reference to the file to modify in the index
     */
    public void modifyDocument(File f) {
        //If keepDelFiles is true, we want to temporarily disable it.  Otherwise,
        //every version of this document will be indexed and never removed from the index
        //even when a new version is saved.  Perhaps this is an option for future
        //development... "keepAllVersions"
        boolean tempDisable_keepDelFiles = false;
        
        if(keepDelFiles){
            tempDisable_keepDelFiles = true;
            keepDelFiles = false;
        }
        
        //Remove the document from the index
        removeDocument(f); 
        
        if(tempDisable_keepDelFiles)
            keepDelFiles = true;
        
        //Add the document back to the index
        indexDocument(f);
    }

    /**
     * Generic debugging function which will print the names of all of the files stored in the index
     * @return the names of each of the files in the index, separated by a space
     */
    public String getIndexInfo(){
        String message = "The index at " + indexDir.getAbsolutePath() + " contains the following documents: \n";
        prepReader();
        try {            
            for(int i = 0; i < reader.maxDoc(); i++){
                String filename = reader.document(i).get(FILE_PATH);
                int path = filename.lastIndexOf("/");
                filename = filename.substring(path+1);
                message += (filename + ", ");
            }
        } catch (IOException ex) {
            Logger.exception(ex);
        }
        message += "\n";
        closeReader();
        return message;
    }
    
    // Utilitiy Indexing Methods
    private void prepWriter() {
        try {
            //Open the writer
            index = FSDirectory.open(indexDir);
            config = new IndexWriterConfig(Version.LUCENE_36, analyzer);
            writer = new IndexWriter(index, config);
        } catch (IOException ex) {
            Logger.exception(ex);
        }
    }

    private void prepReader() {
        try {
            reader = IndexReader.open(FSDirectory.open(indexDir));
        } catch (IOException ex) {
            Logger.exception(ex);
        }
    }

    private void closeWriter() {
        try {
            //Close the writer
            writer.close();
            index.close();
            writer = null;
            index = null;
        } catch (CorruptIndexException ex) {
            Logger.exception(ex);
        } catch (IOException ex) {
            Logger.exception(ex);
        }
    }

    private void closeReader() {
        try {
            reader.close();
            reader = null;
        } catch (IOException ex) {
            Logger.exception(ex);
        }
    }

    ////////////////////////////////
    // Similarity Testing Methods //
    ////////////////////////////////    
    /**
     * 
     * @param file The file to compare against the index
     * @return A HashMap of filenames and the percentages by which each of those files match against the input file
     */
    public HashMap<String, Double> testSimilarity(File file) {
        
        //First add the file to the index
        indexDocument(file);
        
        //Update the current DocVectors found in index
        updateDocVectors();
                
        int docId = findDocumentId(file);
        prepReader();
        //If the file to compare was not found in the index, return null
        if (docId == -1) {
            Logger.error("File " + file.getName() + " was not found in the index");
            return null;
        }

        DocVector vector = docVectors.get(file.getAbsolutePath());
        HashMap<String, Double> resultsMap = new HashMap<String, Double>();
        Iterator iterator = docVectors.entrySet().iterator();
        while (iterator.hasNext()) {
            Map.Entry pairs = (Map.Entry) iterator.next();
            if (!pairs.getKey().equals(file.getAbsolutePath())) {
                double cosim = getCosineSimilarity(vector, (DocVector) pairs.getValue());
                if (Double.isNaN(cosim)) {
                    cosim = 0.0;
                }
                resultsMap.put((String) pairs.getKey(), cosim);
            }            
        }
        closeReader();
        return resultsMap;
    }

    /**
     * 
     * @param file The file to compare against the index
     * @param threshold A number between 0 and 100.  If the score for each compared file is greater than this number, it will be returned in HashMap
     * @return A HashMap of filenames and the percentage score by which each of those files match against the input file.  Only entries where the score is above the input threshold will be returned.
     */
    public HashMap<String,Double> testSimilarityWithThreshold(File file, int threshold) {
        double thresh = threshold / 100.0;
        HashMap<String, Double> map = testSimilarity(file);
        Iterator iterator = map.entrySet().iterator();
        while (iterator.hasNext()) {
            Map.Entry pairs = (Map.Entry) iterator.next();
            if ((Double) pairs.getValue() < thresh) {
                iterator.remove();
            }            
        }
        return map;
    }

    //Returns the Cosine similarity between the two DocVector objects
    private double getCosineSimilarity(DocVector d1, DocVector d2) {
        return (d1.vector.dotProduct(d2.vector)) / (d1.vector.getNorm() * d2.vector.getNorm());
    }

    //Iterates through the index and rebuilds an up-to-date HashMap of filenames and their DocVectors in relation to the current index
    private void updateDocVectors() {
        docVectors = new HashMap<String, DocVector>();
        try {
            prepReader();
            // first find all terms in the index
            Map<String, Integer> terms = new HashMap<String, Integer>();
            TermEnum termEnum = reader.terms(new Term(FILE_CONTENTS));
            int pos = 0;
            while (termEnum.next()) {
                Term term = termEnum.term();
                if (!FILE_CONTENTS.equals(term.field())) {
                    break;
                }
                terms.put(term.text(), pos++);
            }

            for (int i = 0; i < reader.maxDoc(); i++) {
                TermFreqVector[] tfvs = reader.getTermFreqVectors(i);
                DocVector vector = new DocVector(terms);
                if (tfvs != null) {
                    for (TermFreqVector tfv : tfvs) {
                        String[] termTexts = tfv.getTerms();
                        int[] termFreqs = tfv.getTermFrequencies();
                        for (int j = 0; j < termTexts.length; j++) {
                            vector.setEntry(termTexts[j], termFreqs[j]);
                        }
                    }
                }
                vector.normalize();
                docVectors.put(reader.document(i).get(FILE_PATH).toString(), vector);
            }
        } catch (IOException ex) {
            Logger.exception(ex);
        }
        closeReader();
    }

    //Finds the documentID of f in relation to the current index
    private int findDocumentId(File f) {
        prepReader();
        try {
            for (int i = 0; i < reader.maxDoc(); i++) {
                if (reader.document(i).get(FILE_PATH).equals(f.getAbsolutePath())) {
                    return i;
                }
            }
        } catch (CorruptIndexException ex) {
            Logger.exception(ex);
        } catch (IOException ex) {
            Logger.exception(ex);
        }
        closeReader();
        return -1;
    }
}
