/*
 * Decompiled with CFR 0.152.
 */
package ca.pfv.spmf.algorithms.clustering.text_clusterer;

import ca.pfv.spmf.algorithms.clustering.text_clusterer.Record;
import ca.pfv.spmf.algorithms.clustering.text_clusterer.SimilarRecords;
import ca.pfv.spmf.algorithms.clustering.text_clusterer.TextCluster;
import ca.pfv.spmf.tools.MemoryLogger;
import ca.pfv.spmf.tools.textprocessing.PorterStemmer;
import ca.pfv.spmf.tools.textprocessing.StopWordAnalyzer;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.Closeable;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;

/**
 * Groups text records by pairing every record with the other record it is
 * most similar to.
 *
 * <p>Input: one record per line, formatted as {@code recordId<TAB>text}, where
 * {@code recordId} is an integer. Each text is lower-cased, stripped of
 * non-alphanumeric characters, optionally filtered for stop words and
 * optionally stemmed. Every record is then turned into a TF-IDF vector over
 * the vocabulary of the whole input, and similarity between two records is
 * the dot product of their vectors.
 *
 * <p>Output: a header line {@code RecordId<TAB>Clusternum} followed by one
 * {@code recordId<TAB>clusterNumber} line per cluster member. A record can
 * appear in more than one cluster.
 */
public class TextClusterAlgo {
    /** Every distinct (optionally stemmed) token seen in the input of the current run. */
    private HashSet<String> allWords = new HashSet<String>();
    /** Maps the zero-based position of a record in the input to the id read from its first column. */
    private HashMap<Integer, Integer> idMap = new HashMap<Integer, Integer>();
    /** Wall-clock time (ms) at which the last run started. */
    private long startTimestamp = 0L;
    /** Wall-clock time (ms) at which the last run finished, successfully or not. */
    private long endTimeStamp = 0L;
    /** Whether tokens are stemmed before being added to the vocabulary. */
    private boolean stemFlag;
    /** Whether stop words are removed from the text before tokenizing. */
    private boolean stopWordFlag;
    /** Stemmer used when {@link #stemFlag} is set; recreated on every run. */
    private PorterStemmer stemmer;

    /**
     * Stores the preprocessing options on this instance and runs the clustering.
     *
     * @param inputPath    path of the tab-separated input file
     * @param outputPath   path of the file the clusters are written to
     * @param stemFlag     {@code true} to stem every token
     * @param stopWordFlag {@code true} to remove stop words
     */
    public void runAlgorithm(String inputPath, String outputPath, boolean stemFlag, boolean stopWordFlag) {
        this.stemFlag = stemFlag;
        this.stopWordFlag = stopWordFlag;
        this.runAlgorithm(inputPath, outputPath);
    }

    /**
     * Runs the clustering with the stemming / stop-word options currently
     * stored on this instance (both {@code false} unless the four-argument
     * overload was called before).
     *
     * <p>No exception is propagated to the caller: failures are reported on
     * standard output together with a stack trace.
     *
     * @param inputPath  path of the tab-separated input file
     * @param outputPath path of the file the clusters are written to
     */
    public void runAlgorithm(String inputPath, String outputPath) {
        this.startTimestamp = System.currentTimeMillis();
        this.stemmer = new PorterStemmer();
        // Forget the vocabulary and id mapping of any earlier run on this
        // instance; stale words would otherwise end up in every TF-IDF vector.
        this.allWords.clear();
        this.idMap.clear();

        BufferedReader inputReader = null;
        BufferedWriter outputWriter = null;
        try {
            // Validate before touching the file system: new File(null) throws.
            if (inputPath == null || outputPath == null) {
                System.out.println("Please pass the path of the input");
                return;
            }
            inputReader = new BufferedReader(new FileReader(new File(inputPath)));
            outputWriter = new BufferedWriter(new FileWriter(new File(outputPath)));

            ArrayList<Record> records = this.loadInput(inputReader, this.stemFlag, this.stopWordFlag);
            this.computeTfIdfVectors(records);
            double[][] sim = this.buildSimilarityMatrix(records);
            ArrayList<SimilarRecords> similarRecordPairs = this.findNearestNeighbours(sim);
            HashSet<TextCluster> clusters = this.buildClusters(similarRecordPairs);
            this.writeClusters(outputWriter, clusters);

            // Close explicitly on the success path so that a failing flush is
            // reported through the catch block below.
            outputWriter.close();
        }
        catch (Exception e) {
            System.out.println("Either file didn't exist or error while clustering");
            e.printStackTrace();
        }
        finally {
            closeResource(outputWriter);
            closeResource(inputReader);
            this.endTimeStamp = System.currentTimeMillis();
        }
    }

    /**
     * Prints the duration of the last run and the maximum memory value held
     * by {@link MemoryLogger}.
     *
     * <p>NOTE(review): this class only reads the logger; nothing in this class
     * asks it to record memory usage during a run.
     */
    public void printStatistics() {
        System.out.println("========== Text Clusterer - STATS ============");
        System.out.println(" Total time ~: " + (this.endTimeStamp - this.startTimestamp) + " ms");
        System.out.println(" Max memory:" + MemoryLogger.getInstance().getMaxMemory() + " mb ");
        System.out.println("=====================================");
    }

    /**
     * Closes a resource if it was opened, printing (not propagating) any
     * failure so that the remaining cleanup still runs.
     */
    private static void closeResource(Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        }
        catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Computes the TF-IDF vector of every record and stores it on the record.
     * All vectors use the same component order: the iteration order of the
     * vocabulary at the time of the call.
     */
    private void computeTfIdfVectors(ArrayList<Record> records) {
        String[] vocabulary = this.allWords.toArray(new String[this.allWords.size()]);
        // The IDF of a word does not depend on the record, so compute it once
        // per word instead of once per (record, word) pair.
        float[] idf = new float[vocabulary.length];
        for (int w = 0; w < vocabulary.length; ++w) {
            idf[w] = this.findInverseDocumentFrequency(vocabulary[w], records);
        }
        for (Record record : records) {
            double[] tfIdfVector = new double[vocabulary.length];
            for (int w = 0; w < vocabulary.length; ++w) {
                double tf = this.findTermFrequency(record.getAttribute(), vocabulary[w]);
                tfIdfVector[w] = tf * (double)idf[w];
            }
            record.setTfVector(tfIdfVector);
        }
    }

    /**
     * Builds the full pairwise similarity matrix. The dot product is
     * symmetric, so each pair is computed once and mirrored.
     */
    private double[][] buildSimilarityMatrix(ArrayList<Record> records) {
        int count = records.size();
        double[][] sim = new double[count][count];
        for (int i = 0; i < count; ++i) {
            double[] rowVector = records.get(i).getTfVector();
            for (int j = i; j < count; ++j) {
                double similarity = this.calculateSimilarity(rowVector, records.get(j).getTfVector());
                sim[i][j] = similarity;
                sim[j][i] = similarity;
            }
        }
        return sim;
    }

    /**
     * For every record, finds the other record with the highest strictly
     * positive similarity. A record without such a neighbour is paired with
     * itself (similarity 0) so that it is still reported.
     */
    private ArrayList<SimilarRecords> findNearestNeighbours(double[][] sim) {
        ArrayList<SimilarRecords> similarRecordPairs = new ArrayList<SimilarRecords>();
        for (int i = 0; i < sim.length; ++i) {
            double max = 0.0;
            // Default to the record itself; defaulting to position 0 would
            // drop this record and report record 0 in its place.
            int jpos = i;
            for (int j = 0; j < sim.length; ++j) {
                if (i != j && sim[i][j] > max) {
                    max = sim[i][j];
                    jpos = j;
                }
            }
            SimilarRecords pair = new SimilarRecords();
            pair.setRecord1Pos(i);
            pair.setRecord2Pos(jpos);
            pair.setSimilarity(max);
            similarRecordPairs.add(pair);
        }
        return similarRecordPairs;
    }

    /**
     * Wraps every pair in a two-element {@link TextCluster} and collects the
     * clusters in a set; which clusters count as duplicates is decided by
     * {@code TextCluster}'s own equality.
     */
    private HashSet<TextCluster> buildClusters(ArrayList<SimilarRecords> similarRecordPairs) {
        HashSet<TextCluster> clusters = new HashSet<TextCluster>();
        for (SimilarRecords similarPair : similarRecordPairs) {
            int i = similarPair.getRecord1Pos();
            int j = similarPair.getRecord2Pos();
            ArrayList<Integer> members = new ArrayList<Integer>();
            members.add(i);
            members.add(j);
            TextCluster cluster = new TextCluster();
            cluster.setCluster(members);
            clusters.add(cluster);
        }
        return clusters;
    }

    /**
     * Writes the header and one {@code recordId<TAB>clusterNumber} line per
     * distinct member of each cluster. Cluster numbers start at 0 and follow
     * the iteration order of the set.
     *
     * @throws IOException if writing to {@code outputWriter} fails
     */
    private void writeClusters(BufferedWriter outputWriter, HashSet<TextCluster> clusters) throws IOException {
        // The set is copied before iterating, as earlier versions of this
        // class did: a copy may iterate in a different order than its source,
        // so keeping the copy keeps the cluster numbering unchanged.
        HashSet<TextCluster> clusterSet = new HashSet<TextCluster>(clusters);
        int clusterNum = 0;
        outputWriter.write("RecordId\tClusternum\n");
        for (TextCluster cluster : clusterSet) {
            ArrayList<Integer> members = cluster.getCluster();
            for (int k = 0; k < members.size(); ++k) {
                Integer position = members.get(k);
                // A self-paired record is stored twice in its cluster; write it once.
                if (members.indexOf(position) < k) {
                    continue;
                }
                outputWriter.write(this.idMap.get(position) + "\t" + clusterNum + "\n");
            }
            ++clusterNum;
        }
    }

    /**
     * Returns the dot product of two TF-IDF vectors. The loop runs over the
     * length of the first vector, so the second must be at least as long.
     */
    private double calculateSimilarity(double[] tfIdfVector1, double[] tfIdfVector2) {
        double similarity = 0.0;
        for (int i = 0; i < tfIdfVector1.length; ++i) {
            similarity += tfIdfVector1[i] * tfIdfVector2[i];
        }
        return similarity;
    }

    /**
     * Reads and preprocesses all records from the input.
     *
     * <p>Each non-blank line is split on tabs; column 0 is parsed as the
     * integer record id and column 1 is the text. The text is lower-cased,
     * every run of non-alphanumeric characters is replaced by a single space,
     * stop words are removed if requested, and the remaining tokens are
     * (optionally stemmed and) added to the vocabulary. Blank lines are
     * skipped.
     *
     * <p>If reading or parsing fails, the stack trace is printed and the
     * records loaded so far are returned.
     *
     * @param inputReader  source of the input lines
     * @param stemFlag     {@code true} to stem every token
     * @param stopWordFlag {@code true} to remove stop words
     * @return the records loaded, in input order
     */
    private ArrayList<Record> loadInput(BufferedReader inputReader, boolean stemFlag, boolean stopWordFlag) {
        ArrayList<Record> records = new ArrayList<Record>();
        try {
            String currentLine;
            int position = 0;
            while ((currentLine = inputReader.readLine()) != null) {
                // A blank (often trailing) line has no id to parse; skipping
                // it keeps it from aborting the whole load.
                if (currentLine.trim().isEmpty()) {
                    continue;
                }
                String[] columns = currentLine.split("\t", -1);
                Record record = new Record();
                int recordId = Integer.parseInt(columns[0]);
                record.setRecordId(recordId);
                String attribute = columns[1].toLowerCase();
                attribute = attribute.replaceAll("[^a-zA-Z0-9]+", " ");
                if (stopWordFlag) {
                    // NOTE(review): a new analyzer is created for every line;
                    // it could be hoisted if StopWordAnalyzer is stateless.
                    StopWordAnalyzer analyzer = new StopWordAnalyzer();
                    attribute = analyzer.removeStopWords(attribute);
                }
                this.idMap.put(position, recordId);
                StringBuilder rebuilt = new StringBuilder();
                for (String word : attribute.split(" ")) {
                    if (stemFlag) {
                        word = this.stemmer.stem(word);
                    }
                    rebuilt.append(word).append(" ");
                    this.allWords.add(word);
                }
                record.setAttribute(rebuilt.toString());
                records.add(record);
                ++position;
            }
            return records;
        }
        catch (Exception e) {
            e.printStackTrace();
            return records;
        }
    }

    /**
     * Returns {@code log(df / (1 + N))}, where {@code df} is the number of
     * records whose text contains {@code term} and {@code N} is the number of
     * records.
     *
     * <p>Because {@code df <= N}, the result is never positive. Similarity
     * multiplies two such weights per term, so the sign cancels out there.
     *
     * <p>NOTE(review): {@code df} is counted with a substring test, so a term
     * also matches inside longer words, whereas term frequency compares whole
     * tokens. Left unchanged to keep existing scores stable.
     */
    private float findInverseDocumentFrequency(String term, ArrayList<Record> records) {
        int documentCount = 0;
        for (Record record : records) {
            if (record.getAttribute().contains(term)) {
                ++documentCount;
            }
        }
        return (float)Math.log((float)documentCount / (1.0f + (float)records.size()));
    }

    /**
     * Returns the fraction of space-separated tokens of {@code document} that
     * equal {@code term}, ignoring case. The division is carried out in
     * single precision.
     */
    private double findTermFrequency(String document, String term) {
        String[] words = document.split(" ");
        int matches = 0;
        for (String word : words) {
            if (word.equalsIgnoreCase(term)) {
                ++matches;
            }
        }
        return (float)matches / (float)words.length;
    }
}

