/*
 * Decompiled with CFR 0.152.
 */
package edu.msu.cme.rdp.readseq.utils;

import edu.msu.cme.rdp.readseq.readers.IndexedSeqReader;
import edu.msu.cme.rdp.readseq.readers.Sequence;
import edu.msu.cme.rdp.readseq.readers.SequenceReader;
import edu.msu.cme.rdp.readseq.writers.FastaWriter;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Set;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.PosixParser;

/**
 * Command-line utility that copies sequences from an input file to a FASTA
 * output file, dropping sequences shorter than a minimum length and,
 * optionally, sequences that are identical to or contained in another sequence.
 */
public class RmDupSeqs {
    /** Command-line options understood by {@link #main(String[])}. */
    private static final Options options = new Options();

    static {
        options.addOption("i", "infile", true, "input fasta file");
        options.addOption("o", "outfile", true, "output fasta file");
        options.addOption("l", "min_seq_length", true, "filter sequence by minimum sequence length, default is 0");
        options.addOption("d", "duplicates", false, "remove identical sequence, or sequence contained by another sequence");
        options.addOption("g", "debug", false, "output the ids that are contained by other sequences to standard out");
    }

    /**
     * Writes the sequences of {@code inFile} that are not a substring of (or
     * identical to) another retained sequence, skipping any whose length is
     * below {@code length}.
     *
     * <p>Sequences are visited in the iteration order of the reader's ID set.
     * A sequence is discarded when an already retained sequence of equal or
     * greater length contains it; an already retained, shorter sequence is
     * evicted when the current sequence contains it. Each surviving record is
     * written as {@code >id<TAB>description}, a newline, the sequence string
     * and a newline.
     *
     * @param inFile  path of the sequence file to read
     * @param outFile path of the file to write
     * @param length  minimum sequence length to keep (inclusive)
     * @param debug   when {@code true}, prints a tab-separated line to standard
     *                out for every discarded sequence, naming the sequence
     *                that contains it
     * @throws IOException if reading the input or writing the output fails
     */
    public static void filterDuplicates(String inFile, String outFile, int length, boolean debug) throws IOException {
        IndexedSeqReader reader = new IndexedSeqReader(new File(inFile));
        try {
            BufferedWriter outWriter = new BufferedWriter(new FileWriter(new File(outFile)));
            try {
                // Retained sequences so far: sequence ID -> sequence string.
                HashMap<String, String> idSet = new HashMap<String, String>();
                Set<String> allseqIDset = reader.getSeqIdSet();
                if (debug) {
                    System.out.println("ID\tdescription\tcontained_by_ID\tdescription");
                }
                for (String id : allseqIDset) {
                    Sequence seq = reader.readSeq(id);
                    String seqString = seq.getSeqString();
                    boolean dup = false;
                    // Retained IDs whose sequence is contained in the current one;
                    // removed after the scan so the map is not modified while iterating.
                    HashSet<String> tempdupSet = new HashSet<String>();
                    for (String exID : idSet.keySet()) {
                        String exSeq = idSet.get(exID);
                        if (exSeq.length() >= seqString.length()) {
                            if (!exSeq.contains(seqString)) {
                                continue;
                            }
                            // Current sequence is covered by a retained one: stop scanning.
                            dup = true;
                            if (debug) {
                                Sequence container = reader.readSeq(exID);
                                System.out.println(id + "\t" + seq.getDesc() + "\t" + exID + "\t" + container.getDesc());
                            }
                            break;
                        }
                        if (seqString.contains(exSeq)) {
                            tempdupSet.add(exID);
                        }
                    }
                    if (!dup) {
                        idSet.put(id, seqString);
                    }
                    // Evictions collected before an early break are still applied.
                    for (String dupid : tempdupSet) {
                        idSet.remove(dupid);
                        if (debug) {
                            Sequence contained = reader.readSeq(dupid);
                            System.out.println(dupid + "\t" + contained.getDesc() + "\t" + id + "\t" + seq.getDesc());
                        }
                    }
                }
                for (String id : idSet.keySet()) {
                    // Re-read the record to obtain its description for the header line.
                    Sequence seq = reader.readSeq(id);
                    String seqString = seq.getSeqString();
                    if (seqString.length() < length) {
                        continue;
                    }
                    outWriter.write(">" + id + "\t" + seq.getDesc() + "\n" + seqString + "\n");
                }
            } finally {
                // Close (and flush) the output even when reading or writing failed.
                outWriter.close();
            }
        } finally {
            reader.close();
        }
    }

    /**
     * Copies every sequence of {@code inFile} whose length is at least
     * {@code length} to {@code outFile}, in input order.
     *
     * @param inFile  path of the sequence file to read
     * @param outFile path of the FASTA file to write
     * @param length  minimum sequence length to keep (inclusive)
     * @throws IOException if reading the input or writing the output fails
     */
    public static void filterByLength(String inFile, String outFile, int length) throws IOException {
        SequenceReader seqReader = new SequenceReader(new File(inFile));
        try {
            FastaWriter outWriter = new FastaWriter(outFile);
            try {
                Sequence seq;
                while ((seq = seqReader.readNextSequence()) != null) {
                    String seqString = seq.getSeqString();
                    if (seqString.length() < length) {
                        continue;
                    }
                    outWriter.writeSeq(seq.getSeqName(), seq.getDesc(), seqString);
                }
            } finally {
                // Release the output even when a read or write failed part-way.
                outWriter.close();
            }
        } finally {
            seqReader.close();
        }
    }

    /**
     * Command-line entry point. Parses the options registered in this class
     * and runs {@link #filterDuplicates} when {@code duplicates} is given,
     * otherwise {@link #filterByLength}. On a parse error, a missing
     * {@code infile}/{@code outfile}, or a non-numeric {@code min_seq_length},
     * prints the usage text followed by an {@code ERROR:} line on standard
     * error and returns without processing.
     *
     * @param args command-line arguments
     * @throws Exception if the selected filter fails
     */
    public static void main(String[] args) throws Exception {
        String outFile;
        String inFile;
        int length = 0;
        boolean debug = false;
        boolean removeDuplicates = false;
        try {
            CommandLine line = new PosixParser().parse(options, args);
            removeDuplicates = line.hasOption("duplicates");
            if (line.hasOption("min_seq_length")) {
                length = Integer.parseInt(line.getOptionValue("min_seq_length"));
            }
            if (!line.hasOption("infile")) {
                throw new IllegalArgumentException("infile is required");
            }
            inFile = line.getOptionValue("infile");
            if (!line.hasOption("outfile")) {
                throw new IllegalArgumentException("outfile is required");
            }
            outFile = line.getOptionValue("outfile");
            debug = line.hasOption("debug");
        }
        catch (Exception e) {
            // Any argument problem ends here: show usage and the reason, then stop.
            new HelpFormatter().printHelp(120, "RmRedundantSeqs [options]", "", options, "");
            System.err.println("ERROR: " + e.getMessage());
            return;
        }
        if (removeDuplicates) {
            RmDupSeqs.filterDuplicates(inFile, outFile, length, debug);
        } else {
            RmDupSeqs.filterByLength(inFile, outFile, length);
        }
    }
}

