/*
 * Decompiled with CFR 0.152.
 */
package org.apache.mahout.utils.clustering;

import com.google.common.base.Charsets;
import com.google.common.collect.Lists;
import com.google.common.io.Closeables;
import com.google.common.io.Files;
import java.io.Closeable;
import java.io.File;
import java.io.IOException;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.net.URI;
import java.nio.charset.Charset;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import org.apache.commons.cli2.Option;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.mahout.clustering.Cluster;
import org.apache.mahout.clustering.WeightedVectorWritable;
import org.apache.mahout.clustering.cdbw.CDbwEvaluator;
import org.apache.mahout.clustering.evaluation.ClusterEvaluator;
import org.apache.mahout.clustering.evaluation.RepresentativePointsDriver;
import org.apache.mahout.common.AbstractJob;
import org.apache.mahout.common.ClassUtils;
import org.apache.mahout.common.HadoopUtil;
import org.apache.mahout.common.Pair;
import org.apache.mahout.common.commandline.DefaultOptionCreator;
import org.apache.mahout.common.distance.DistanceMeasure;
import org.apache.mahout.common.iterator.sequencefile.PathFilters;
import org.apache.mahout.common.iterator.sequencefile.PathType;
import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterable;
import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterable;
import org.apache.mahout.utils.clustering.AbstractClusterWriter;
import org.apache.mahout.utils.clustering.CSVClusterWriter;
import org.apache.mahout.utils.clustering.ClusterDumperWriter;
import org.apache.mahout.utils.clustering.ClusterWriter;
import org.apache.mahout.utils.clustering.GraphMLClusterWriter;
import org.apache.mahout.utils.vectors.VectorHelper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Dumps clusters stored in sequence files to the console or to a file, as
 * plain text, CSV or GraphML, optionally together with the points assigned to
 * each cluster and a set of cluster-quality figures.
 *
 * <p>The class can be driven from the command line through {@link #main(String[])}
 * / {@link #run(String[])}, or programmatically by constructing it with the
 * cluster and points directories, configuring it through the setters and
 * calling {@link #printClusters(String[])}.
 */
public final class ClusterDumper
extends AbstractJob {
    /** Option name: maximum number of points to keep per cluster. */
    public static final String SAMPLE_POINTS = "samplePoints";
    /** Distance measure handed to the cluster writers and to the evaluators; may be null when not run via {@link #run(String[])}. */
    protected DistanceMeasure measure;
    /** Option name: output file; when absent the dump goes to the console. */
    public static final String OUTPUT_OPTION = "output";
    /** Option name: dictionary file type, {@code text} or {@code sequencefile}. */
    public static final String DICTIONARY_TYPE_OPTION = "dictionaryType";
    /** Option name: dictionary file. */
    public static final String DICTIONARY_OPTION = "dictionary";
    /** Option name: directory with the clustered points. */
    public static final String POINTS_DIR_OPTION = "pointsDir";
    /** Option name: number of top terms to print. */
    public static final String NUM_WORDS_OPTION = "numWords";
    /** Option name: number of characters of the formatted cluster to print. */
    public static final String SUBSTRING_OPTION = "substring";
    /** Option name: directory containing the cluster sequence files. */
    public static final String SEQ_FILE_DIR_OPTION = "seqFileDir";
    /** Option name: flag that enables the cluster evaluation step. */
    public static final String EVALUATE_CLUSTERS = "evaluate";
    /** Option name: output format, one of the {@link OUTPUT_FORMAT} constants. */
    public static final String OUTPUT_FORMAT_OPT = "outputFormat";
    private static final Logger log = LoggerFactory.getLogger(ClusterDumper.class);
    /** Scratch directory used for the representative points computed during evaluation. */
    private static final String REPRESENTATIVE_POINTS_DIR = "tmp/representative";
    /** Number of iterations requested from the representative points driver. */
    private static final int REPRESENTATIVE_POINTS_ITERATIONS = 5;
    private Path seqFileDir;
    private Path pointsDir;
    private long maxPointsPerCluster = Long.MAX_VALUE;
    private String termDictionary;
    private String dictionaryFormat;
    private String outputFile;
    private int subString = Integer.MAX_VALUE;
    private int numTopFeatures = 10;
    private Map<Integer, List<WeightedVectorWritable>> clusterIdToPoints;
    private OUTPUT_FORMAT outputFormat = OUTPUT_FORMAT.TEXT;
    private boolean runEvaluation;

    /**
     * Creates a dumper for programmatic use and immediately loads the
     * clustered points (if a points directory is given).
     *
     * @param seqFileDir directory containing the cluster sequence files
     * @param pointsDir directory containing the clustered points, or null to dump clusters without points
     */
    public ClusterDumper(Path seqFileDir, Path pointsDir) {
        this.seqFileDir = seqFileDir;
        this.pointsDir = pointsDir;
        this.init();
    }

    /** Creates an unconfigured dumper with a fresh {@link Configuration}; intended for {@link #run(String[])}. */
    public ClusterDumper() {
        this.setConf(new Configuration());
    }

    /**
     * Command-line entry point; delegates to {@link #run(String[])}.
     *
     * @param args command-line arguments
     * @throws Exception if the dump fails
     */
    public static void main(String[] args) throws Exception {
        new ClusterDumper().run(args);
    }

    /**
     * Parses the command line, loads the clustered points and prints the clusters.
     *
     * @param args command-line arguments
     * @return 0 on success, -1 when the arguments could not be parsed
     * @throws Exception if an option value is malformed or the dump fails
     */
    public int run(String[] args) throws Exception {
        this.addOption(SEQ_FILE_DIR_OPTION, "s", "The directory containing Sequence Files for the Clusters", true);
        this.addOption(OUTPUT_OPTION, "o", "Optional output directory. Default is to output to the console.");
        this.addOption(OUTPUT_FORMAT_OPT, "of", "The optional output format to write the results as.  Options: TEXT, CSV or GRAPH_ML", "TEXT");
        this.addOption(SUBSTRING_OPTION, "b", "The number of chars of the asFormatString() to print");
        this.addOption(NUM_WORDS_OPTION, "n", "The number of top terms to print");
        this.addOption(POINTS_DIR_OPTION, "p", "The directory containing points sequence files mapping input vectors to their cluster.  If specified, then the program will output the points associated with a cluster");
        this.addOption(SAMPLE_POINTS, "sp", "Specifies the maximum number of points to include _per_ cluster.  The default is to include all points");
        this.addOption(DICTIONARY_OPTION, "d", "The dictionary file");
        this.addOption(DICTIONARY_TYPE_OPTION, "dt", "The dictionary file type (text|sequencefile)", "text");
        this.addOption(ClusterDumper.buildOption(EVALUATE_CLUSTERS, "e", "Run ClusterEvaluator and CDbwEvaluator over the input.  The output will be appended to the rest of the output at the end.", false, false, null));
        this.addOption(DefaultOptionCreator.distanceMeasureOption().create());
        if (this.parseArguments(args) == null) {
            return -1;
        }

        this.seqFileDir = new Path(this.getOption(SEQ_FILE_DIR_OPTION));
        if (this.hasOption(POINTS_DIR_OPTION)) {
            this.pointsDir = new Path(this.getOption(POINTS_DIR_OPTION));
        }
        this.outputFile = this.getOption(OUTPUT_OPTION);
        if (this.hasOption(SUBSTRING_OPTION)) {
            int sub = Integer.parseInt(this.getOption(SUBSTRING_OPTION));
            // A negative value keeps the default (no truncation).
            if (sub >= 0) {
                this.subString = sub;
            }
        }
        this.termDictionary = this.getOption(DICTIONARY_OPTION);
        this.dictionaryFormat = this.getOption(DICTIONARY_TYPE_OPTION);
        if (this.hasOption(NUM_WORDS_OPTION)) {
            this.numTopFeatures = Integer.parseInt(this.getOption(NUM_WORDS_OPTION));
        }
        if (this.hasOption(OUTPUT_FORMAT_OPT)) {
            this.outputFormat = OUTPUT_FORMAT.valueOf(this.getOption(OUTPUT_FORMAT_OPT));
        }
        this.maxPointsPerCluster = this.hasOption(SAMPLE_POINTS) ? Long.parseLong(this.getOption(SAMPLE_POINTS)) : Long.MAX_VALUE;
        this.runEvaluation = this.hasOption(EVALUATE_CLUSTERS);
        String distanceMeasureClass = this.getOption("distanceMeasure");
        this.measure = ClassUtils.instantiateAs(distanceMeasureClass, DistanceMeasure.class);

        this.init();
        this.printClusters(null);
        return 0;
    }

    /**
     * Writes every cluster found under {@code seqFileDir/part-*} using the
     * configured output format, followed by the evaluation figures when
     * evaluation was requested.
     *
     * <p>When an output file is configured the underlying writer is always
     * closed, and a failure to close it is reported unless another exception
     * is already propagating. Console output is left open, except for GraphML
     * where the cluster writer is closed.
     *
     * @param dictionary term dictionary to use; replaced by the dictionary
     *        loaded from the configured dictionary file when one is set
     * @throws IllegalArgumentException if the configured dictionary format is neither {@code text} nor {@code sequencefile}
     * @throws IllegalStateException if evaluation was requested without a points directory or a distance measure
     * @throws Exception if reading the clusters, evaluating them or writing the output fails
     */
    public void printClusters(String[] dictionary) throws Exception {
        if (this.runEvaluation) {
            // Fail before any output is produced instead of hitting a
            // NullPointerException half-way through the dump.
            if (this.pointsDir == null) {
                throw new IllegalStateException("Cluster evaluation requires the " + POINTS_DIR_OPTION + " option");
            }
            if (this.measure == null) {
                throw new IllegalStateException("Cluster evaluation requires a distance measure");
            }
        }

        Configuration conf = new Configuration();
        String[] terms = dictionary;
        if (this.termDictionary != null) {
            if ("text".equals(this.dictionaryFormat)) {
                terms = VectorHelper.loadTermDictionary(new File(this.termDictionary));
            } else if ("sequencefile".equals(this.dictionaryFormat)) {
                terms = VectorHelper.loadTermDictionary(conf, this.termDictionary);
            } else {
                throw new IllegalArgumentException("Invalid dictionary format");
            }
        }

        // Only writers this method opened on a file are closed; the console is left alone.
        boolean shouldClose = this.outputFile != null;
        Writer writer = this.openWriter(conf);
        ClusterWriter clusterWriter = null;
        boolean succeeded = false;
        try {
            clusterWriter = this.createClusterWriter(writer, terms);
            long numWritten = clusterWriter.write(new SequenceFileDirValueIterable<Cluster>(new Path(this.seqFileDir, "part-*"), PathType.GLOB, conf));
            writer.flush();
            if (this.runEvaluation) {
                this.writeEvaluation(writer, conf);
            }
            log.info("Wrote {} clusters", numWritten);
            succeeded = true;
        } finally {
            // Swallow close failures only when another exception is already in
            // flight, so a lost flush on an otherwise successful run is reported.
            boolean swallowCloseFailure = !succeeded;
            if (shouldClose) {
                try {
                    if (clusterWriter != null) {
                        Closeables.close(clusterWriter, swallowCloseFailure);
                    }
                } finally {
                    // Covers the case where the cluster writer was never created;
                    // closing an already closed writer has no effect.
                    Closeables.close(writer, swallowCloseFailure);
                }
            } else if (clusterWriter instanceof GraphMLClusterWriter) {
                Closeables.close(clusterWriter, swallowCloseFailure);
            }
        }
    }

    /**
     * Opens the destination for the dump: the console when no output file is
     * configured, a Hadoop file system stream for {@code s3n://} locations,
     * and a local UTF-8 file otherwise.
     */
    private Writer openWriter(Configuration conf) throws IOException {
        if (this.outputFile == null) {
            // Console output keeps the platform default charset.
            return new OutputStreamWriter(System.out);
        }
        if (this.outputFile.startsWith("s3n://")) {
            Path target = new Path(this.outputFile);
            FileSystem fs = FileSystem.get(target.toUri(), conf);
            return new OutputStreamWriter(fs.create(target), Charsets.UTF_8);
        }
        return Files.newWriter(new File(this.outputFile), Charsets.UTF_8);
    }

    /**
     * Computes representative points for the clusters and appends the
     * ClusterEvaluator and CDbwEvaluator figures to the writer.
     */
    private void writeEvaluation(Writer writer, Configuration conf) throws Exception {
        String measureClassName = this.measure.getClass().getName();
        HadoopUtil.delete(conf, new Path[]{new Path(REPRESENTATIVE_POINTS_DIR)});
        RepresentativePointsDriver.main(new String[]{
            "--input", this.seqFileDir.toString(),
            "--output", REPRESENTATIVE_POINTS_DIR,
            "--clusteredPoints", this.pointsDir.toString(),
            "--distanceMeasure", measureClassName,
            "--maxIter", String.valueOf(REPRESENTATIVE_POINTS_ITERATIONS)});
        conf.set("org.apache.mahout.clustering.measure", measureClassName);
        conf.set("org.apache.mahout.clustering.stateIn", REPRESENTATIVE_POINTS_DIR + "/representativePoints-" + REPRESENTATIVE_POINTS_ITERATIONS);

        ClusterEvaluator ce = new ClusterEvaluator(conf, this.seqFileDir);
        writer.append("\n");
        writer.append("Inter-Cluster Density: ").append(String.valueOf(ce.interClusterDensity())).append("\n");
        writer.append("Intra-Cluster Density: ").append(String.valueOf(ce.intraClusterDensity())).append("\n");

        CDbwEvaluator cdbw = new CDbwEvaluator(conf, this.seqFileDir);
        writer.append("CDbw Inter-Cluster Density: ").append(String.valueOf(cdbw.interClusterDensity())).append("\n");
        writer.append("CDbw Intra-Cluster Density: ").append(String.valueOf(cdbw.intraClusterDensity())).append("\n");
        writer.append("CDbw Separation: ").append(String.valueOf(cdbw.separation())).append("\n");
        writer.flush();
    }

    /**
     * Builds the cluster writer matching the configured output format.
     *
     * @param writer destination of the formatted output
     * @param dictionary term dictionary passed to the text and GraphML writers; may be null
     * @return a writer for the configured format, never null
     * @throws IOException if the cluster writer cannot be created
     * @throws IllegalStateException if the configured format has no writer
     */
    ClusterWriter createClusterWriter(Writer writer, String[] dictionary) throws IOException {
        switch (this.outputFormat) {
            case TEXT:
                return new ClusterDumperWriter(writer, this.clusterIdToPoints, this.measure, this.numTopFeatures, dictionary, this.subString);
            case CSV:
                return new CSVClusterWriter(writer, this.clusterIdToPoints, this.measure);
            case GRAPH_ML:
                return new GraphMLClusterWriter(writer, this.clusterIdToPoints, this.measure, this.numTopFeatures, dictionary, this.subString);
            default:
                // Guards against a format being added to the enum without a writer.
                throw new IllegalStateException("Unsupported output format: " + this.outputFormat);
        }
    }

    /** Loads the clustered points when a points directory is set, otherwise uses an empty mapping. */
    private void init() {
        if (this.pointsDir != null) {
            Configuration conf = new Configuration();
            this.clusterIdToPoints = ClusterDumper.readPoints(this.pointsDir, this.maxPointsPerCluster, conf);
        } else {
            this.clusterIdToPoints = Collections.emptyMap();
        }
    }

    /** @return the configured output file, or null when output goes to the console */
    public String getOutputFile() {
        return this.outputFile;
    }

    /** @param outputFile output file to write to, or null for the console */
    public void setOutputFile(String outputFile) {
        this.outputFile = outputFile;
    }

    /** @return the substring length passed to the text and GraphML writers */
    public int getSubString() {
        return this.subString;
    }

    /** @param subString substring length passed to the text and GraphML writers */
    public void setSubString(int subString) {
        this.subString = subString;
    }

    /** @return the points loaded per cluster id; empty when no points directory was given */
    public Map<Integer, List<WeightedVectorWritable>> getClusterIdToPoints() {
        return this.clusterIdToPoints;
    }

    /** @return the configured dictionary file, or null when none is set */
    public String getTermDictionary() {
        return this.termDictionary;
    }

    /**
     * Sets the dictionary file and its format.
     *
     * @param termDictionary dictionary file location
     * @param dictionaryType {@code text} or {@code sequencefile}
     */
    public void setTermDictionary(String termDictionary, String dictionaryType) {
        this.termDictionary = termDictionary;
        this.dictionaryFormat = dictionaryType;
    }

    /** @param num number of top features passed to the text and GraphML writers */
    public void setNumTopFeatures(int num) {
        this.numTopFeatures = num;
    }

    /** @return the number of top features passed to the text and GraphML writers */
    public int getNumTopFeatures() {
        return this.numTopFeatures;
    }

    /** @return the maximum number of points kept per cluster when points are loaded */
    public long getMaxPointsPerCluster() {
        return this.maxPointsPerCluster;
    }

    /**
     * Stores the per-cluster point limit. The limit is applied only when the
     * points are loaded, which happens in the two-argument constructor and in
     * {@link #run(String[])}; points that are already loaded are not trimmed.
     *
     * @param maxPointsPerCluster maximum number of points to keep per cluster
     */
    public void setMaxPointsPerCluster(long maxPointsPerCluster) {
        this.maxPointsPerCluster = maxPointsPerCluster;
    }

    /**
     * Reads the clustered points below the given directory and groups them by
     * the integer key of each record, keeping at most
     * {@code maxPointsPerCluster} points per key.
     *
     * @param pointsPathDir directory containing the clustered points
     * @param maxPointsPerCluster maximum number of points to keep per key
     * @param conf Hadoop configuration used to read the sequence files
     * @return a map, sorted by key, from record key to the points kept for it
     */
    public static Map<Integer, List<WeightedVectorWritable>> readPoints(Path pointsPathDir, long maxPointsPerCluster, Configuration conf) {
        Map<Integer, List<WeightedVectorWritable>> result = new TreeMap<Integer, List<WeightedVectorWritable>>();
        for (Pair<IntWritable, WeightedVectorWritable> record
                : new SequenceFileDirIterable<IntWritable, WeightedVectorWritable>(pointsPathDir, PathType.LIST, PathFilters.logsCRCFilter(), conf)) {
            int keyValue = record.getFirst().get();
            List<WeightedVectorWritable> pointList = result.get(keyValue);
            if (pointList == null) {
                // Register the key even if the limit is zero, so every seen key appears in the result.
                pointList = Lists.newArrayList();
                result.put(keyValue, pointList);
            }
            if (pointList.size() < maxPointsPerCluster) {
                pointList.add(record.getSecond());
            }
        }
        return result;
    }

    /** Output formats supported by the dumper. */
    public static enum OUTPUT_FORMAT {
        TEXT,
        CSV,
        GRAPH_ML;

    }
}

