package edu.stanford.nlp.wordseg;

import edu.stanford.nlp.fsm.DFSA;
import edu.stanford.nlp.fsm.DFSAState;
import edu.stanford.nlp.fsm.DFSATransition;
import edu.stanford.nlp.io.EncodingPrintWriter;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.SentenceUtils;
import edu.stanford.nlp.ling.TaggedWord;
import edu.stanford.nlp.ling.Word;
import edu.stanford.nlp.process.WordSegmenter;
import edu.stanford.nlp.semgraph.semgrex.ssurgeon.AddDep;
import edu.stanford.nlp.sequences.SeqClassifierFlags;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.StringUtils;
import edu.stanford.nlp.util.logging.Redwood;
import edu.stanford.nlp.wordseg.ChineseStringUtils;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Iterator;
import java.util.List;
import java.util.Properties;
import java.util.Set;
import java.util.regex.Pattern;

/* loaded from: input_file:edu/stanford/nlp/wordseg/MaxMatchSegmenter.class */
public class MaxMatchSegmenter implements WordSegmenter {
    /** Debug switch; fixed to false and not read by any code visible in this class. */
    private static final boolean DEBUG = false;
    /** Redwood logging channels for this class; assigned in the static initializer. */
    private static Redwood.RedwoodChannels logger;
    /** Word-length cap of 10 characters; train(List) and greedilySegmentWords apply this same limit. */
    private static final int maxLength = 10;
    /** Lattice states, one per character boundary (ids 0..len); rebuilt by buildSegmentationLattice. */
    private List<DFSAState<Word, Integer>> states;
    /** Pattern for a character in the range 一–鿿 anchored at the start of the input. */
    private static final Pattern chineseStartChars;
    /** Pattern for a character in the range 一–鿿 anchored at the end of the input. */
    private static final Pattern chineseEndChars;
    /** Pattern for a single character in the range 一–鿿. */
    private static final Pattern chineseChars;
    /** Character class of digits, numerals and symbols used by excludeChar to reject lexicon entries. */
    private static final Pattern excludeChars;
    private static final long serialVersionUID = 8263734344886904724L;
    /** Assertion flag (marked synthetic); set in the static initializer to the negated desiredAssertionStatus(). */
    static final /* synthetic */ boolean $assertionsDisabled;
    /** The lexicon: every word accepted by addStringToLexicon. */
    private final Set<String> words = Generics.newHashSet();
    /** Length of the string most recently passed to buildSegmentationLattice; -1 until a lattice is built. */
    private int len = -1;
    /** Number of transitions added to the current lattice; reset by buildSegmentationLattice. */
    private int edgesNb = 0;
    /** Word lattice for the current input; null until buildSegmentationLattice has run. */
    private DFSA<Word, Integer> lattice = null;

    /* loaded from: input_file:edu/stanford/nlp/wordseg/MaxMatchSegmenter$MatchHeuristic.class */
    public enum MatchHeuristic {
        MINWORDS,
        MAXWORDS,
        MAXLEN
    }

    /**
     * No-op: this implementation does nothing to prepare for training.
     *
     * @param d ignored
     */
    @Override // edu.stanford.nlp.process.WordSegmenter
    public void initializeTraining(double d) {
    }

    /**
     * Adds the words of every tree in the collection to the lexicon.
     *
     * @param collection trees to learn from; each one is handed to {@code train(Tree)} in iteration order
     */
    @Override // edu.stanford.nlp.process.WordSegmenter
    public void train(Collection<Tree> collection) {
        for (Tree tree : collection) {
            train(tree);
        }
    }

    /**
     * Trains on a single tree by passing its tagged yield to {@code train(List)}.
     *
     * @param tree tree whose tagged words are offered to the lexicon
     */
    @Override // edu.stanford.nlp.process.WordSegmenter
    public void train(Tree tree) {
        train((List<TaggedWord>) tree.taggedYield());
    }

    /**
     * Offers each tagged word to the lexicon, skipping words longer than 10 characters.
     *
     * @param list tagged words to learn from; only their word strings are used
     */
    @Override // edu.stanford.nlp.process.WordSegmenter
    public void train(List<TaggedWord> list) {
        for (TaggedWord tagged : list) {
            String token = tagged.word();
            if (token.length() > 10) {
                // Over-long words are never added to the lexicon.
                continue;
            }
            addStringToLexicon(token);
        }
    }

    /** No-op: this implementation does nothing when training ends. */
    @Override // edu.stanford.nlp.process.WordSegmenter
    public void finishTraining() {
    }

    /**
     * Loads a lexicon by delegating to {@code addLexicon}.
     *
     * @param str path of the lexicon file to read
     */
    @Override // edu.stanford.nlp.process.WordSegmenter
    public void loadSegmenter(String str) {
        addLexicon(str);
    }

    /**
     * Segments a string: builds the word lattice, extracts the MINWORDS path,
     * merges adjacent non-Chinese pieces, and runs the result through
     * {@code ChineseStringUtils.CTPPostProcessor}.
     *
     * <p>Each intermediate result is echoed through {@code printlnErr}.
     *
     * @param str text to segment
     * @return the segmented words, one {@code Word} per whitespace-separated
     *         token of the post-processed answer; empty tokens are dropped
     */
    @Override // edu.stanford.nlp.process.WordSegmenter
    public List<HasWord> segment(String str) {
        buildSegmentationLattice(str);
        ArrayList<Word> rawWords = maxMatchSegmentation();
        printlnErr("raw output: " + SentenceUtils.listToString(rawWords));
        ArrayList<Word> mergedWords = postProcessSentence(rawWords);
        // Hand the post-processor the same space-joined text that is logged.
        // List.toString() would produce "[a, b, c]" and leak brackets and
        // commas into the tokens.
        String mergedText = SentenceUtils.listToString(mergedWords);
        printlnErr("processed output: " + mergedText);
        String answer = new ChineseStringUtils.CTPPostProcessor().postProcessingAnswer(mergedText, false);
        printlnErr("Sighan2005 output: " + answer);
        List<HasWord> result = new ArrayList<>();
        for (String token : answer.split("\\s+")) {
            // split() yields an empty first element for empty or
            // whitespace-led input; that is not a word.
            if (!token.isEmpty()) {
                result.add(new Word(token));
            }
        }
        return result;
    }

    /**
     * Adds one candidate entry to the lexicon unless it is rejected.
     *
     * <p>Rejected with a logged warning: the empty string, and any string
     * containing {@code AddDep.ATOM_DELIMITER} (presumably a single space,
     * judging by the warning text; confirm). Rejected with a stderr note:
     * any string for which {@code excludeChar} returns true.
     *
     * @param str candidate lexicon entry
     */
    private void addStringToLexicon(String str) {
        if (str.isEmpty()) {
            logger.warn("WARNING: blank line in lexicon");
            return;
        }
        boolean hasDelimiter = str.contains(AddDep.ATOM_DELIMITER);
        if (hasDelimiter) {
            logger.warn("WARNING: word with space in lexicon");
            return;
        }
        if (excludeChar(str)) {
            printlnErr("skipping word: " + str);
            return;
        }
        this.words.add(str);
    }

    /**
     * Reads a UTF-8 lexicon file, one entry per line, and offers every line
     * to {@code addStringToLexicon}.
     *
     * <p>The reader is closed when reading finishes or fails.
     *
     * @param str path of the lexicon file
     * @throws RuntimeException wrapping the {@code IOException} if reading fails;
     *         a missing file is logged and terminates the JVM with status -1
     */
    private void addLexicon(String str) {
        // try-with-resources releases the file handle on every exit path.
        try (BufferedReader lexiconReader =
                new BufferedReader(new InputStreamReader(new FileInputStream(str), "UTF-8"))) {
            String line;
            while ((line = lexiconReader.readLine()) != null) {
                addStringToLexicon(line);
            }
        } catch (FileNotFoundException e) {
            logger.error("Lexicon not found: " + str);
            System.exit(-1);
        } catch (IOException e2) {
            logger.error("IO error while reading: " + str, e2);
            throw new RuntimeException(e2);
        }
    }

    /**
     * Builds the word lattice for {@code str} and stores it in this instance.
     *
     * <p>One state is created per character boundary (ids 0..length). State 0
     * is the initial state and the last state is accepting. A transition
     * start→end is added for every substring found in the lexicon (score 1)
     * and for every single character not in the lexicon (score 100).
     *
     * @param str text to build the lattice for
     */
    private void buildSegmentationLattice(String str) {
        final int length = str.length();
        this.edgesNb = 0;
        this.len = length;
        this.states = new ArrayList<>();
        this.lattice = new DFSA<>("wordLattice");
        for (int position = 0; position <= length; position++) {
            this.states.add(new DFSAState<>(Integer.valueOf(position), this.lattice));
        }
        this.lattice.setInitialState(this.states.get(0));
        this.states.get(length).setAccepting(true);

        for (int start = 0; start < length; start++) {
            DFSAState<Word, Integer> from = this.states.get(start);
            // Longest candidate first, shrinking toward a single character.
            for (int end = length; end > start; end--) {
                String candidate = str.substring(start, end);
                if (!$assertionsDisabled && candidate.length() <= 0) {
                    throw new AssertionError();
                }
                boolean inLexicon = this.words.contains(candidate);
                boolean singleChar = end == start + 1;
                if (!inLexicon && !singleChar) {
                    continue;
                }
                // Unknown single characters stay available but are heavily penalised.
                double score = inLexicon ? 1.0d : 100.0d;
                from.addTransition(new DFSATransition<>(null, from, this.states.get(end), new Word(candidate), null, score));
                this.edgesNb++;
            }
        }
    }

    /**
     * Segments the current lattice with the MINWORDS heuristic.
     *
     * @return the words chosen by {@code segmentWords(MatchHeuristic.MINWORDS)}
     * @throws UnsupportedOperationException propagated from segmentWords when no lattice has been built
     */
    public ArrayList<Word> maxMatchSegmentation() {
        return segmentWords(MatchHeuristic.MINWORDS);
    }

    /**
     * Extracts a path through the lattice built by the most recent call to
     * {@code buildSegmentationLattice} and returns its words in text order.
     *
     * <p>States are visited left to right; for each state the best known cost
     * and the transition that achieved it are kept, then the path is read
     * back from the final state. Words equal to {@code AddDep.ATOM_DELIMITER}
     * are left out of the result.
     *
     * @param matchHeuristic path-selection strategy; MINWORDS and MAXWORDS are handled
     * @return the words on the selected path (empty for an empty input)
     * @throws UnsupportedOperationException if no lattice has been built yet, or
     *         if a transition is processed under any other heuristic
     */
    public ArrayList<Word> segmentWords(MatchHeuristic matchHeuristic) throws UnsupportedOperationException {
        if (this.lattice == null || this.len < 0) {
            throw new UnsupportedOperationException("buildSegmentationLattice must be run first");
        }
        double[] costs = new double[this.len + 1];
        List<DFSATransition<Word, Integer>> backPointers = new ArrayList<>(this.len + 1);
        for (int i = 0; i <= this.len; i++) {
            backPointers.add(null);
        }
        costs[0] = 0.0d;
        for (int i = 1; i <= this.len; i++) {
            costs[i] = Double.MAX_VALUE;
        }

        for (int start = 0; start < this.len; start++) {
            for (DFSATransition<Word, Integer> transition : this.states.get(start).transitions()) {
                int end = transition.getTarget().stateID().intValue();
                double score = transition.score();
                if (matchHeuristic == MatchHeuristic.MINWORDS) {
                    // Test the value that is actually stored. Testing
                    // "cost + 1" while storing "cost + score" lets a
                    // 100-point unknown-character edge overwrite a cheaper
                    // path that had already reached this state.
                    double candidate = costs[start] + score;
                    if (candidate < costs[end]) {
                        costs[end] = candidate;
                        backPointers.set(end, transition);
                    }
                } else if (matchHeuristic == MatchHeuristic.MAXWORDS) {
                    // NOTE(review): the test and the stored value disagree
                    // here as well; kept as-is because the intended MAXWORDS
                    // objective is unclear. Confirm before changing.
                    if (costs[start] + 1.0d < costs[end]) {
                        costs[end] = costs[start] - score;
                        backPointers.set(end, transition);
                    }
                } else {
                    throw new UnsupportedOperationException("unimplemented heuristic");
                }
            }
        }

        // Walk the back-pointers from the final state; the words come out
        // last-to-first, so reverse once at the end rather than inserting
        // at index 0 each time.
        ArrayList<Word> segmented = new ArrayList<>();
        int position = this.len;
        while (position > 0) {
            DFSATransition<Word, Integer> transition = backPointers.get(position);
            Word word = transition.getInput();
            if (!word.word().equals(AddDep.ATOM_DELIMITER)) {
                segmented.add(word);
            }
            position = transition.getSource().stateID().intValue();
        }
        java.util.Collections.reverse(segmented);
        return segmented;
    }

    /**
     * Segments {@code str} left to right, always taking the longest lexicon
     * word that starts at the current position.
     *
     * <p>Candidates are at most 10 characters long. When no multi-character
     * candidate is in the lexicon, the single character at the current
     * position becomes a word on its own.
     *
     * @param str text to segment
     * @return the words in text order
     */
    public ArrayList<Word> greedilySegmentWords(String str) {
        ArrayList<Word> segmented = new ArrayList<>();
        final int total = str.length();
        int start = 0;
        while (start < total) {
            int end = Math.min(total, start + 10);
            // Shrink the window until it is a lexicon word or one character.
            while (end > start + 1 && !this.words.contains(str.substring(start, end))) {
                end--;
            }
            segmented.add(new Word(str.substring(start, end)));
            start = end;
        }
        return segmented;
    }

    /**
     * Command-line driver: segments standard input line by line and prints
     * the result through a {@code Sighan2005DocumentReaderAndWriter}.
     *
     * <p>Properties read from the arguments: {@code lexicon} (required;
     * without it the JVM exits with status 1), {@code greedy} (use
     * {@code greedilySegmentWords}) and {@code maxwords} (use the MAXWORDS
     * heuristic). If neither of the last two is set, MINWORDS is used.
     *
     * @param strArr command-line arguments, parsed with {@code StringUtils.argsToProperties}
     */
    public static void main(String[] strArr) {
        Properties props = StringUtils.argsToProperties(strArr);
        SeqClassifierFlags flags = new SeqClassifierFlags(props);
        MaxMatchSegmenter segmenter = new MaxMatchSegmenter();
        String lexiconFile = props.getProperty("lexicon");
        if (lexiconFile != null) {
            segmenter.addLexicon(lexiconFile);
        } else {
            logger.error("Error: no lexicon file!");
            System.exit(1);
        }
        Sighan2005DocumentReaderAndWriter readerWriter = new Sighan2005DocumentReaderAndWriter();
        readerWriter.init(flags);
        BufferedReader stdin = new BufferedReader(new InputStreamReader(System.in));
        PrintWriter stdout = new PrintWriter(System.out);
        // The mode never changes between lines, so resolve it once.
        boolean greedy = props.getProperty("greedy") != null;
        boolean maxWords = props.getProperty("maxwords") != null;
        int lineNumber = 0;
        while (true) {
            lineNumber++;
            logger.info("line: " + lineNumber);
            try {
                String line = stdin.readLine();
                if (line == null) {
                    break;
                }
                String segmented;
                if (greedy) {
                    segmented = SentenceUtils.listToString(segmenter.greedilySegmentWords(line));
                } else if (maxWords) {
                    segmenter.buildSegmentationLattice(line);
                    segmented = SentenceUtils.listToString(segmenter.segmentWords(MatchHeuristic.MAXWORDS));
                } else {
                    segmenter.buildSegmentationLattice(line);
                    segmented = SentenceUtils.listToString(segmenter.maxMatchSegmentation());
                }
                Iterator<List<CoreLabel>> documents = readerWriter.getIterator(new StringReader(segmented));
                while (documents.hasNext()) {
                    readerWriter.printAnswers(documents.next(), stdout);
                }
            } catch (IOException e) {
                // Swallowing this inside an endless loop could spin forever
                // on a broken stream; report it and stop reading.
                logger.error("IO error while segmenting standard input", e);
                break;
            }
        }
        stdout.flush();
    }

    /**
     * Prints one line through {@code EncodingPrintWriter.err} (presumably standard error) using UTF-8.
     *
     * @param str text to print
     */
    private static void printlnErr(String str) {
        EncodingPrintWriter.err.println(str, "UTF-8");
    }

    /**
     * Joins neighbouring words when neither side of the boundary is a
     * Chinese character.
     *
     * <p>A word is appended to the previous output word when the last
     * character of that word and the first character of the current one both
     * fail {@code isChinese}; otherwise it is kept as a separate word.
     *
     * @param arrayList segmented words in text order; not modified
     * @return a new list with the merges applied
     */
    private static ArrayList<Word> postProcessSentence(ArrayList<Word> arrayList) {
        ArrayList<Word> merged = new ArrayList<>();
        for (Word current : arrayList) {
            if (!merged.isEmpty()) {
                int lastIndex = merged.size() - 1;
                String previousText = merged.get(lastIndex).toString();
                String currentText = current.toString();
                // Empty words have no boundary character to inspect; leave them unmerged.
                if (!previousText.isEmpty() && !currentText.isEmpty()) {
                    String previousLastChar = previousText.substring(previousText.length() - 1);
                    String currentFirstChar = currentText.substring(0, 1);
                    if (!isChinese(previousLastChar) && !isChinese(currentFirstChar)) {
                        merged.set(lastIndex, new Word(previousText + currentText));
                        // The current word now lives inside the merged entry;
                        // adding it again would duplicate its text.
                        continue;
                    }
                }
            }
            merged.add(current);
        }
        return merged;
    }

    /**
     * Tells whether {@code str} begins with a character matched by
     * {@code chineseStartChars}.
     *
     * @param str text to inspect
     * @return true if the first character is in the pattern's range
     */
    private static boolean startsWithChinese(String str) {
        // find(), not matches(): the pattern describes only the first
        // character, so a full-string match would reject every string longer
        // than one character.
        return chineseStartChars.matcher(str).find();
    }

    /**
     * Tells whether {@code str} ends with a character matched by
     * {@code chineseEndChars}.
     *
     * @param str text to inspect
     * @return true if the pattern is found at the end of the text
     */
    private static boolean endsWithChinese(String str) {
        // find(), not matches(): the pattern describes only the last
        // character, so a full-string match would reject every string longer
        // than one character.
        return chineseEndChars.matcher(str).find();
    }

    /**
     * Tells whether {@code str} as a whole matches {@code chineseChars}.
     *
     * <p>The match covers the full string, so only a one-character string
     * can succeed; postProcessSentence calls this with one-character substrings.
     *
     * @param str text to test
     * @return true if the entire string matches the pattern
     */
    private static boolean isChinese(String str) {
        return chineseChars.matcher(str).matches();
    }

    /**
     * Tells whether {@code str} as a whole matches {@code excludeChars}.
     *
     * <p>The pattern is a single character class and the match covers the
     * full string, so only a one-character string can be excluded.
     *
     * @param str candidate lexicon entry
     * @return true if the entire string matches the exclusion pattern
     */
    private static boolean excludeChar(String str) {
        return excludeChars.matcher(str).matches();
    }

    // Static initialisation: assertion flag, logger, and the compiled patterns.
    static {
        // True when assertions are NOT enabled for this class.
        $assertionsDisabled = !MaxMatchSegmenter.class.desiredAssertionStatus();
        logger = Redwood.channels(MaxMatchSegmenter.class);
        // A character in the range 一–鿿 at the start, at the end, or anywhere.
        chineseStartChars = Pattern.compile("^[一-鿿]");
        chineseEndChars = Pattern.compile("[一-鿿]$");
        chineseChars = Pattern.compile("[一-鿿]");
        // One character class: ASCII and full-width digits, Chinese numerals, and assorted symbols.
        excludeChars = Pattern.compile("[0-9０-９一二三四五六七八九十零〇百千万亿兩○◯〡-〩〸-〺-#$%&'*+/@_－＃＄％＆＇＊＋／＠＿]");
    }
}
