/*
 * Decompiled with CFR 0.152.
 */
package org.carrot2.clustering.lingo;

import com.carrotsearch.hppc.BitSet;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.carrot2.attrs.AttrComposite;
import org.carrot2.attrs.AttrDouble;
import org.carrot2.attrs.AttrInteger;
import org.carrot2.attrs.AttrObject;
import org.carrot2.attrs.AttrString;
import org.carrot2.clustering.Cluster;
import org.carrot2.clustering.ClusteringAlgorithm;
import org.carrot2.clustering.Document;
import org.carrot2.clustering.SharedInfrastructure;
import org.carrot2.clustering.lingo.ClusterBuilder;
import org.carrot2.clustering.lingo.LingoProcessingContext;
import org.carrot2.internal.clustering.ClusteringAlgorithmUtilities;
import org.carrot2.language.EphemeralDictionaries;
import org.carrot2.language.LabelFilter;
import org.carrot2.language.LanguageComponents;
import org.carrot2.language.Stemmer;
import org.carrot2.language.StopwordFilter;
import org.carrot2.language.Tokenizer;
import org.carrot2.text.preprocessing.CompletePreprocessingPipeline;
import org.carrot2.text.preprocessing.LabelFormatter;
import org.carrot2.text.preprocessing.PreprocessingContext;
import org.carrot2.text.vsm.ReducedVectorSpaceModelContext;
import org.carrot2.text.vsm.TermDocumentMatrixBuilder;
import org.carrot2.text.vsm.TermDocumentMatrixReducer;
import org.carrot2.text.vsm.VectorSpaceModelContext;

/**
 * Lingo clustering algorithm.
 *
 * <p>Clustering runs the configured preprocessing pipeline over the input documents, builds
 * term-document and term-phrase matrices, reduces the term-document matrix, and then delegates
 * to the {@link ClusterBuilder} to build labels, assign documents to them and merge clusters.
 * The resulting clusters are finally reordered by a weighted combination of score and size.
 */
public class LingoClusteringAlgorithm
extends AttrComposite
implements ClusteringAlgorithm {
    /** Name of this algorithm. */
    public static final String NAME = "Lingo";

    /** Language component types reported by {@link #requiredLanguageComponents()}. */
    private static final Set<Class<?>> REQUIRED_LANGUAGE_COMPONENTS =
        new HashSet<>(
            Arrays.asList(
                Stemmer.class,
                Tokenizer.class,
                StopwordFilter.class,
                LabelFilter.class,
                LabelFormatter.class));

    /**
     * "Size-score sorting ratio" attribute (range 0.0 to 1.0, default 0.0). Its value is passed
     * to {@link SharedInfrastructure#reorderByWeightedScoreAndSize} when ordering the final
     * clusters.
     */
    public AttrDouble scoreWeight;

    /**
     * "Desired cluster count" attribute (range 2 to 100, default 30). Used as the base value for
     * {@link #computeClusterCount(int, int)}; it is not the exact number of clusters produced.
     */
    public AttrInteger desiredClusterCount;

    /** Input preprocessing components. Registered as the {@code preprocessing} attribute. */
    public CompletePreprocessingPipeline preprocessing;

    /** Term-document matrix builder. Registered as the {@code matrixBuilder} attribute. */
    public TermDocumentMatrixBuilder matrixBuilder;

    /** Term-document matrix reducer. Registered as the {@code matrixReducer} attribute. */
    public TermDocumentMatrixReducer matrixReducer;

    /** Cluster label supplier. Registered as the {@code clusterBuilder} attribute. */
    public ClusterBuilder clusterBuilder;

    /**
     * Optional per-request dictionaries. When non-null, they override the language components
     * passed to {@link #cluster(Stream, LanguageComponents)}.
     */
    public EphemeralDictionaries dictionaries;

    /** Query hint forwarded to the preprocessing pipeline. */
    public final AttrString queryHint;

    /**
     * Creates the algorithm and registers all of its attributes.
     *
     * <p>NOTE(review): the casts to the parameterized {@code AttrObject.Builder} are kept because
     * the return type of {@code label(...)} is not visible from this file; if it already returns
     * the typed builder, the casts are redundant and can be dropped.
     */
    public LingoClusteringAlgorithm() {
        this.scoreWeight = this.attributes.register(
            "scoreWeight",
            AttrDouble.builder()
                .label("Size-score sorting ratio")
                .min(0.0)
                .max(1.0)
                .defaultValue(0.0));

        this.desiredClusterCount = this.attributes.register(
            "desiredClusterCount",
            AttrInteger.builder()
                .label("Desired cluster count")
                .min(2)
                .max(100)
                .defaultValue(30));

        this.attributes.register(
            "preprocessing",
            ((AttrObject.Builder<CompletePreprocessingPipeline>)
                    AttrObject.builder(CompletePreprocessingPipeline.class)
                        .label("Input preprocessing components"))
                .getset(() -> this.preprocessing, v -> this.preprocessing = v)
                .defaultValue(CompletePreprocessingPipeline::new));

        this.attributes.register(
            "matrixBuilder",
            ((AttrObject.Builder<TermDocumentMatrixBuilder>)
                    AttrObject.builder(TermDocumentMatrixBuilder.class)
                        .label("Term-document matrix builder"))
                .getset(() -> this.matrixBuilder, v -> this.matrixBuilder = v)
                .defaultValue(TermDocumentMatrixBuilder::new));

        this.attributes.register(
            "matrixReducer",
            ((AttrObject.Builder<TermDocumentMatrixReducer>)
                    AttrObject.builder(TermDocumentMatrixReducer.class)
                        .label("Term-document matrix reducer"))
                .getset(() -> this.matrixReducer, v -> this.matrixReducer = v)
                .defaultValue(TermDocumentMatrixReducer::new));

        this.attributes.register(
            "clusterBuilder",
            ((AttrObject.Builder<ClusterBuilder>)
                    AttrObject.builder(ClusterBuilder.class)
                        .label("Cluster label supplier"))
                .getset(() -> this.clusterBuilder, v -> this.clusterBuilder = v)
                .defaultValue(ClusterBuilder::new));

        ClusteringAlgorithmUtilities.registerDictionaries(
            this.attributes, () -> this.dictionaries, v -> this.dictionaries = v);

        this.queryHint = this.attributes.register("queryHint", SharedInfrastructure.queryHintAttribute());
    }

    /**
     * Returns the language component types this algorithm declares as required.
     *
     * @return the shared set of required component classes
     */
    @Override
    public Set<Class<?>> requiredLanguageComponents() {
        return REQUIRED_LANGUAGE_COMPONENTS;
    }

    /**
     * Clusters the given documents.
     *
     * @param docStream documents to cluster; the stream is fully consumed into a list
     * @param languageComponents language components to use; overridden by {@link #dictionaries}
     *     when those are set
     * @param <T> concrete document type
     * @return clusters reordered by weighted score and size; no clusters are built when
     *     preprocessing yields no labels
     */
    @Override
    public <T extends Document> List<Cluster<T>> cluster(Stream<? extends T> docStream, LanguageComponents languageComponents) {
        // Materialize the stream: documents are later looked up by index.
        List<T> documents = docStream.collect(Collectors.toList());

        if (this.dictionaries != null) {
            languageComponents = this.dictionaries.override(languageComponents);
        }

        PreprocessingContext context =
            this.preprocessing.preprocess(documents.stream(), this.queryHint.get(), languageComponents);

        List<Cluster<T>> clusters = new ArrayList<>();
        if (context.hasLabels()) {
            VectorSpaceModelContext vsmContext = new VectorSpaceModelContext(context);
            ReducedVectorSpaceModelContext reducedVsmContext = new ReducedVectorSpaceModelContext(vsmContext);
            LingoProcessingContext lingoContext = new LingoProcessingContext(reducedVsmContext);

            // Matrix building and reduction.
            TermDocumentMatrixBuilder matrixBuilder = this.matrixBuilder;
            matrixBuilder.buildTermDocumentMatrix(vsmContext);
            matrixBuilder.buildTermPhraseMatrix(vsmContext);
            this.matrixReducer.reduce(
                reducedVsmContext,
                computeClusterCount(this.desiredClusterCount.get(), documents.size()));

            // Label building, document assignment and cluster merging.
            this.clusterBuilder.buildLabels(lingoContext, matrixBuilder.termWeighting);
            this.clusterBuilder.assignDocuments(lingoContext);
            this.clusterBuilder.merge(lingoContext);

            // Convert the processing context's parallel arrays into Cluster instances.
            LabelFormatter labelFormatter =
                lingoContext.preprocessingContext.languageComponents.get(LabelFormatter.class);
            int[] clusterLabelIndex = lingoContext.clusterLabelFeatureIndex;
            BitSet[] clusterDocuments = lingoContext.clusterDocuments;
            double[] clusterLabelScore = lingoContext.clusterLabelScore;

            for (int i = 0; i < clusterLabelIndex.length; i++) {
                int labelFeature = clusterLabelIndex[i];
                if (labelFeature < 0) {
                    // Entries with a negative label feature index produce no cluster.
                    continue;
                }

                Cluster<T> cluster = new Cluster<>();
                cluster.addLabel(context.format(labelFormatter, labelFeature));
                cluster.setScore(clusterLabelScore[i]);

                // Each set bit is an index into the documents list.
                BitSet bs = clusterDocuments[i];
                for (int bit = bs.nextSetBit(0); bit >= 0; bit = bs.nextSetBit(bit + 1)) {
                    cluster.addDocument(documents.get(bit));
                }

                clusters.add(cluster);
            }
        }

        return SharedInfrastructure.reorderByWeightedScoreAndSize(clusters, this.scoreWeight.get());
    }

    /**
     * Computes the cluster count passed to the matrix reducer.
     *
     * <p>The value is {@code desiredClusterCountBase / 10 * sqrt(documentCount)}, truncated to an
     * {@code int} and capped at {@code documentCount}.
     *
     * @param desiredClusterCountBase base value of the desired cluster count
     * @param documentCount number of input documents
     * @return the computed cluster count, never greater than {@code documentCount}
     */
    static int computeClusterCount(int desiredClusterCountBase, int documentCount) {
        int scaled = (int) ((double) desiredClusterCountBase / 10.0 * Math.sqrt(documentCount));
        return Math.min(scaled, documentCount);
    }
}

