/*
 * Decompiled with CFR 0.152.
 */
package org.apache.nutch.scoring.similarity.cosine;

import java.io.BufferedReader;
import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.StringUtils;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.nutch.scoring.similarity.cosine.DocVector;
import org.apache.nutch.scoring.similarity.util.LuceneAnalyzerUtil;
import org.apache.nutch.scoring.similarity.util.LuceneTokenizer;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class Model {
    public static ArrayList<DocVector> docVectors = new ArrayList();
    private static final Logger LOG = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
    public static boolean isModelCreated = false;
    private static List<String> stopWords;

    public static synchronized void createModel(Configuration conf) throws IOException {
        if (isModelCreated) {
            LOG.info("Model exists, skipping model creation");
            return;
        }
        LOG.info("Creating Cosine model");
        try {
            String line;
            if (!conf.get("scoring.similarity.stopword.file").equals("stopwords.txt.template")) {
                String stopWord;
                stopWords = new ArrayList<String>();
                BufferedReader br = new BufferedReader(conf.getConfResourceAsReader(conf.get("scoring.similarity.stopword.file")));
                while ((stopWord = br.readLine()) != null) {
                    stopWords.add(stopWord);
                }
                LOG.info("Loaded custom stopwords from {}", (Object)conf.get("scoring.similarity.stopword.file"));
            }
            int[] ngramArr = Model.retrieveNgrams(conf);
            int mingram = ngramArr[0];
            int maxgram = ngramArr[1];
            LOG.info("Value of mingram: {} maxgram: {}", (Object)mingram, (Object)maxgram);
            StringBuilder sb = new StringBuilder();
            BufferedReader br = new BufferedReader(conf.getConfResourceAsReader(conf.get("cosine.goldstandard.file")));
            while ((line = br.readLine()) != null) {
                sb.append(line);
            }
            DocVector goldStandard = Model.createDocVector(sb.toString(), mingram, maxgram);
            if (goldStandard == null) {
                throw new Exception("Could not create DocVector for goldstandard");
            }
            docVectors.add(goldStandard);
        }
        catch (Exception e) {
            LOG.warn("Failed to add {} to model : {}", (Object)conf.get("cosine.goldstandard.file", "goldstandard.txt.template"), (Object)StringUtils.stringifyException((Throwable)e));
        }
        if (docVectors.size() > 0) {
            LOG.info("Cosine model creation complete");
            isModelCreated = true;
        } else {
            LOG.info("Cosine model creation failed");
        }
    }

    public static DocVector createDocVector(String content, int mingram, int maxgram) {
        LuceneTokenizer tokenizer;
        if (mingram > 1 && maxgram > 1) {
            LOG.info("Using Ngram Cosine Model, user specified mingram value : {} maxgram value : {}", (Object)mingram, (Object)maxgram);
            tokenizer = new LuceneTokenizer(content, LuceneTokenizer.TokenizerType.STANDARD, LuceneAnalyzerUtil.StemFilterType.PORTERSTEM_FILTER, mingram, maxgram);
        } else if (mingram > 1) {
            maxgram = mingram;
            LOG.info("Using Ngram Cosine Model, user specified mingram value : {} maxgram value : {}", (Object)mingram, (Object)maxgram);
            tokenizer = new LuceneTokenizer(content, LuceneTokenizer.TokenizerType.STANDARD, LuceneAnalyzerUtil.StemFilterType.PORTERSTEM_FILTER, mingram, maxgram);
        } else {
            tokenizer = stopWords != null ? new LuceneTokenizer(content, LuceneTokenizer.TokenizerType.STANDARD, stopWords, true, LuceneAnalyzerUtil.StemFilterType.PORTERSTEM_FILTER) : new LuceneTokenizer(content, LuceneTokenizer.TokenizerType.STANDARD, true, LuceneAnalyzerUtil.StemFilterType.PORTERSTEM_FILTER);
        }
        TokenStream tStream = tokenizer.getTokenStream();
        HashMap<String, Integer> termVector = new HashMap<String, Integer>();
        try {
            CharTermAttribute charTermAttribute = (CharTermAttribute)tStream.addAttribute(CharTermAttribute.class);
            tStream.reset();
            while (tStream.incrementToken()) {
                String term = charTermAttribute.toString();
                LOG.debug(term);
                if (termVector.containsKey(term)) {
                    int count = termVector.get(term);
                    termVector.put(term, ++count);
                    continue;
                }
                termVector.put(term, 1);
            }
            DocVector docVector = new DocVector();
            docVector.setTermFreqVector(termVector);
            return docVector;
        }
        catch (IOException e) {
            LOG.error("Error creating DocVector : {}", (Object)StringUtils.stringifyException((Throwable)e));
            return null;
        }
    }

    public static float computeCosineSimilarity(DocVector docVector) {
        float[] scores = new float[docVectors.size()];
        int i = 0;
        float maxScore = 0.0f;
        for (DocVector corpusDoc : docVectors) {
            float numerator = docVector.dotProduct(corpusDoc);
            float denominator = docVector.getL2Norm() * corpusDoc.getL2Norm();
            float currentScore = numerator / denominator;
            scores[i++] = currentScore;
            maxScore = currentScore > maxScore ? currentScore : maxScore;
        }
        return maxScore;
    }

    public static int[] retrieveNgrams(Configuration conf) {
        int[] ngramArr = new int[2];
        String[] ngramStr = conf.getStrings("scoring.similarity.ngrams", new String[]{"1,1"});
        ngramArr[0] = Integer.parseInt(ngramStr[0]);
        ngramArr[1] = ngramStr.length > 1 ? Integer.parseInt(ngramStr[1]) : ngramArr[0];
        return ngramArr;
    }
}

