package edu.stanford.nlp.parser.lexparser;

import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.io.NumberRangesFileFilter;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.StringLabel;
import edu.stanford.nlp.ling.TaggedWord;
import edu.stanford.nlp.ling.Word;
import edu.stanford.nlp.parser.lexparser.ChineseCharacterBasedLexicon;
import edu.stanford.nlp.process.WordSegmenter;
import edu.stanford.nlp.semgraph.semgrex.ssurgeon.AddDep;
import edu.stanford.nlp.stats.ClassicCounter;
import edu.stanford.nlp.stats.Counters;
import edu.stanford.nlp.stats.Distribution;
import edu.stanford.nlp.stats.EquivalenceClassEval;
import edu.stanford.nlp.trees.MemoryTreebank;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreeToBracketProcessor;
import edu.stanford.nlp.trees.TreeTransformer;
import edu.stanford.nlp.trees.Trees;
import edu.stanford.nlp.trees.WordCatConstituent;
import edu.stanford.nlp.trees.WordCatEqualityChecker;
import edu.stanford.nlp.trees.WordCatEquivalenceClasser;
import edu.stanford.nlp.trees.international.pennchinese.RadicalMap;
import edu.stanford.nlp.trees.tregex.gui.FileTreeModel;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.HashIndex;
import edu.stanford.nlp.util.StringUtils;
import edu.stanford.nlp.util.logging.Redwood;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.io.Writer;
import java.text.DecimalFormat;
import java.text.NumberFormat;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;

/* loaded from: input_file:edu/stanford/nlp/parser/lexparser/ChineseCharacterBasedLexiconTraining.class */
/**
 * Command-line driver that gathers corpus statistics for, trains, serializes and
 * evaluates a character-based Chinese lexicon, optionally together with a
 * {@link LexicalizedParser}.
 *
 * <p>Flags recognised by {@link #main(String[])} (the number is the maximum number of
 * values the flag is registered with): {@code -parser} (3), {@code -lex} (3),
 * {@code -test} (2), {@code -stats} (2), {@code -out} (1), {@code -lengthPenalty} (1),
 * {@code -penaltyType} (1), {@code -maxLength} (1), plus the value-less switches
 * {@code -eval}, {@code -annotate}, {@code -norm}, {@code -combo} and {@code -rad}.
 * Every argument is additionally offered to
 * {@code ChineseTreebankParserParams.setOptionFlag}.
 */
public class ChineseCharacterBasedLexiconTraining {
    private static final Redwood.RedwoodChannels log = Redwood.channels(ChineseCharacterBasedLexiconTraining.class);
    protected static final NumberFormat formatter = new DecimalFormat("0.000");

    /** Lexicon file read when {@code -lex} is given without any value. */
    private static final String DEFAULT_LEXICON_PATH = "parsers/chineseCharLex.ser.gz";

    /**
     * Prints singleton-character, singleton-word, POS, radical and word-length
     * statistics for a collection of trees.
     *
     * <p>Words equal to {@code ".$."} are skipped. A "singleton" is a key whose count is
     * below 1.5. Each counted word contributes one count per character plus one
     * end-of-word symbol to the character counter.
     *
     * @param collection  trees whose tagged yields are counted
     * @param printWriter destination of the report; dereferenced unconditionally, so it
     *                    must not be {@code null}
     */
    public static void printStats(Collection<Tree> collection, PrintWriter printWriter) {
        ClassicCounter<Integer> wordLengthCounter = new ClassicCounter<>();
        ClassicCounter<TaggedWord> wordCounter = new ClassicCounter<>();
        ClassicCounter<ChineseCharacterBasedLexicon.Symbol> charCounter = new ClassicCounter<>();
        int treeCount = 0;
        for (Tree tree : collection) {
            treeCount++;
            for (TaggedWord taggedWord : tree.taggedYield()) {
                String word = taggedWord.word();
                if (word.equals(".$.")) {
                    continue;
                }
                wordCounter.incrementCount(taggedWord);
                int length = word.length();
                wordLengthCounter.incrementCount(Integer.valueOf(length));
                for (int i = 0; i < length; i++) {
                    charCounter.incrementCount(ChineseCharacterBasedLexicon.Symbol.cannonicalSymbol(word.charAt(i)));
                }
                charCounter.incrementCount(ChineseCharacterBasedLexicon.Symbol.END_WORD);
            }
        }

        Set<ChineseCharacterBasedLexicon.Symbol> singletonChars = Counters.keysBelow(charCounter, 1.5d);
        Set<TaggedWord> singletonWords = Counters.keysBelow(wordCounter, 1.5d);

        ClassicCounter<String> singletonWordTags = new ClassicCounter<>();
        for (TaggedWord singleton : singletonWords) {
            singletonWordTags.incrementCount(singleton.tag());
        }
        Distribution<String> singletonWordTagDist = Distribution.getDistribution(singletonWordTags);

        ClassicCounter<Character> singletonCharRadicals = new ClassicCounter<>();
        for (ChineseCharacterBasedLexicon.Symbol singleton : singletonChars) {
            singletonCharRadicals.incrementCount(Character.valueOf(RadicalMap.getRadical(singleton.getCh())));
        }
        Distribution<Character> singletonCharRadicalDist = Distribution.getDistribution(singletonCharRadicals);

        Distribution<Integer> wordLengthDist = Distribution.getDistribution(wordLengthCounter);

        NumberFormat percent = new DecimalFormat("##.##%");
        // The "of types" ratios divide two ints; force floating-point division so the
        // percentage is not truncated to 0 or 1 (and so 0/0 does not throw).
        double singletonCharCount = singletonChars.size();
        double singletonWordCount = singletonWords.size();

        printWriter.println("There are " + singletonChars.size() + " singleton chars out of " + ((int) charCounter.totalCount()) + " tokens and " + charCounter.size() + " types found in " + treeCount + " trees.");
        printWriter.println("Thus singletonChars comprise " + percent.format(singletonCharCount / charCounter.totalCount()) + " of tokens and " + percent.format(singletonCharCount / charCounter.size()) + " of types.");
        printWriter.println();
        printWriter.println("There are " + singletonWords.size() + " singleton words out of " + ((int) wordCounter.totalCount()) + " tokens and " + wordCounter.size() + " types.");
        printWriter.println("Thus singletonWords comprise " + percent.format(singletonWordCount / wordCounter.totalCount()) + " of tokens and " + percent.format(singletonWordCount / wordCounter.size()) + " of types.");
        printWriter.println();
        printWriter.println("Distribution over singleton word POS:");
        printWriter.println(singletonWordTagDist.toString());
        printWriter.println();
        printWriter.println("Distribution over singleton char radicals:");
        printWriter.println(singletonCharRadicalDist.toString());
        printWriter.println();
        printWriter.println("Distribution over word length:");
        printWriter.println(wordLengthDist);
    }

    /**
     * Entry point: parses the command line, opens the optional {@code -out} file and
     * runs the requested statistics / training / testing steps.
     *
     * @param strArr command-line arguments
     * @throws IOException if the output file, a treebank or a serialized model cannot be
     *                     read or written
     */
    public static void main(String[] strArr) throws IOException {
        Map<String, String[]> argMap = StringUtils.argsToMap(strArr, flagArities());
        PrintWriter out = null;
        if (argMap.containsKey("-out")) {
            // Auto-flushing writer, so each println reaches the file immediately.
            out = new PrintWriter(new OutputStreamWriter(new FileOutputStream(argMap.get("-out")[0]), FileTreeModel.DEFAULT_CHINESE_ENCODING), true);
        }
        try {
            run(strArr, argMap, out);
        } finally {
            if (out != null) {
                out.close();
            }
        }
    }

    /** Builds the flag-name to maximum-value-count table handed to the argument parser. */
    private static Map<String, Integer> flagArities() {
        Map<String, Integer> arities = Generics.newHashMap();
        arities.put("-parser", 3);
        arities.put("-lex", 3);
        arities.put("-test", 2);
        arities.put("-out", 1);
        arities.put("-lengthPenalty", 1);
        arities.put("-penaltyType", 1);
        arities.put("-maxLength", 1);
        arities.put("-stats", 2);
        return arities;
    }

    /**
     * Executes the steps selected on the command line, in order: stats (then exit),
     * parser train/load, lexicon train/load, test.
     *
     * @param args   raw command-line arguments, each offered to the parser params
     * @param argMap parsed flag table
     * @param out    writer for {@code -out}, or {@code null} when the flag is absent
     */
    private static void run(String[] args, Map<String, String[]> argMap, PrintWriter out) throws IOException {
        boolean eval = argMap.containsKey("-eval");

        log.info("ChineseCharacterBasedLexicon called with args:");
        ChineseTreebankParserParams ctpp = new ChineseTreebankParserParams();
        for (int i = 0; i < args.length; i++) {
            ctpp.setOptionFlag(args, i);
            log.info(AddDep.ATOM_DELIMITER + args[i]);
        }
        log.info(new Object[0]); // varargs call with no arguments
        Options op = new Options(ctpp);

        if (argMap.containsKey("-stats")) {
            MemoryTreebank rawTrees = loadTreebank(op, argMap.get("-stats"));
            log.info("Done reading trees.");
            MemoryTreebank statTrees = annotateIfRequested(rawTrees, argMap, ctpp, op);
            // Without -out there is no writer; report on standard output instead of
            // failing with a NullPointerException inside printStats.
            PrintWriter statsOut = (out != null) ? out : new PrintWriter(System.out, true);
            printStats(statTrees, statsOut);
            statsOut.flush();
            System.exit(0);
        }

        if (argMap.containsKey("-norm")) {
            op.testOptions.lengthNormalization = true;
        }
        // Sentences longer than this are skipped by the test loop below.
        int maxSentenceLength = argMap.containsKey("-maxLength") ? Integer.parseInt(argMap.get("-maxLength")[0]) : 1000000;
        // NOTE(review): fixed at 120 regardless of -maxLength; kept as in the original.
        op.testOptions.maxLength = 120;
        boolean combo = argMap.containsKey("-combo");
        if (combo) {
            ctpp.useCharacterBasedLexicon = true;
            op.testOptions.maxSpanForTags = 10;
            op.doDep = false;
            op.dcTags = false;
        }

        LexicalizedParser parser = null;
        Lexicon lexicon = null;
        if (argMap.containsKey("-parser")) {
            String[] parserArgs = argMap.get("-parser");
            if (parserArgs.length > 1) {
                // <treebank path> <file ranges> [<output file>]: train, optionally save.
                parser = LexicalizedParser.trainFromTreebank(parserArgs[0], new NumberRangesFileFilter(parserArgs[1], false), op);
                if (parserArgs.length == 3) {
                    writeSerialized(parser, parserArgs[2], "parser");
                }
            } else {
                parser = LexicalizedParser.loadModel(parserArgs[0], op, new String[0]);
            }
            // From here on use the parser's own lexicon, options and language params.
            lexicon = parser.getLexicon();
            op = parser.getOp();
            ctpp = (ChineseTreebankParserParams) op.tlpParams;
        }

        if (argMap.containsKey("-rad")) {
            ctpp.useUnknownCharacterModel = true;
        }
        if (argMap.containsKey("-lengthPenalty")) {
            ctpp.lengthPenalty = Double.parseDouble(argMap.get("-lengthPenalty")[0]);
        }
        if (argMap.containsKey("-penaltyType")) {
            ctpp.penaltyType = Integer.parseInt(argMap.get("-penaltyType")[0]);
        }

        if (argMap.containsKey("-lex")) {
            String[] lexArgs = argMap.get("-lex");
            if (lexArgs.length > 1) {
                // <treebank path> <file ranges> [<output file>]: train, optionally save.
                lexicon = ctpp.lex(op, new HashIndex<String>(), new HashIndex<String>());
                MemoryTreebank rawTrees = loadTreebank(op, lexArgs);
                log.info("Done reading trees.");
                MemoryTreebank trainTrees = annotateIfRequested(rawTrees, argMap, ctpp, op);
                lexicon.initializeTraining(trainTrees.size());
                lexicon.train(trainTrees);
                lexicon.finishTraining();
                log.info("Done training lexicon.");
                if (lexArgs.length == 3) {
                    writeSerialized(lexicon, lexArgs[2], "lexicon");
                }
            } else {
                lexicon = readLexicon(lexArgs.length == 1 ? lexArgs[0] : DEFAULT_LEXICON_PATH);
            }
        }

        if (argMap.containsKey("-test")) {
            boolean segmentWords = ctpp.segment;
            boolean parse = parser != null;
            assert parse || segmentWords : "-test needs a parser or a segmenting lexicon";
            WordSegmenter segmenter = segmentWords ? (WordSegmenter) lexicon : null;
            MemoryTreebank testTreebank = loadTreebank(op, argMap.get("-test"));
            TreeTransformer subcategoryStripper = op.tlpParams.subcategoryStripper();
            TreeTransformer collinizer = ctpp.collinizer();

            WordCatEquivalenceClasser eqClasser = new WordCatEquivalenceClasser();
            WordCatEqualityChecker eqChecker = new WordCatEqualityChecker();
            // Raw on purpose: the evaluator's type parameters are not visible from this file.
            @SuppressWarnings({"rawtypes", "unchecked"})
            EquivalenceClassEval basicEval = new EquivalenceClassEval(eqClasser, eqChecker, "basic");
            @SuppressWarnings({"rawtypes", "unchecked"})
            EquivalenceClassEval collinsEval = new EquivalenceClassEval(eqClasser, eqChecker, "collinized");

            List<String> evalTypes = new ArrayList<>(3);
            boolean addCommonWordTagBrackets = false;
            if (segmentWords) {
                evalTypes.add("word");
                if (ctpp.segmentMarkov && !parse) {
                    evalTypes.add(WordCatConstituent.tagType);
                    addCommonWordTagBrackets = true;
                }
            }
            if (parse) {
                evalTypes.add(WordCatConstituent.tagType);
                evalTypes.add(WordCatConstituent.catType);
                if (combo) {
                    evalTypes.add("word");
                    addCommonWordTagBrackets = true;
                }
            }
            TreeToBracketProcessor bracketer = new TreeToBracketProcessor(evalTypes);

            log.info("Testing...");
            for (Tree goldTop : testTreebank) {
                Tree gold = goldTop.firstChild();
                List<? extends HasWord> goldSentence = gold.yieldHasWord();
                if (goldSentence.size() > maxSentenceLength) {
                    log.info("Skipping sentence; too long: " + goldSentence.size());
                    continue;
                }
                log.info("Processing sentence; length: " + goldSentence.size());

                List<? extends HasWord> sentence;
                if (segmentWords) {
                    // Glue the gold words back into one character string and re-segment it.
                    StringBuilder goldChars = new StringBuilder();
                    for (HasWord goldWord : goldSentence) {
                        goldChars.append(((StringLabel) goldWord).value());
                    }
                    sentence = segmenter.segment(goldChars.toString());
                } else {
                    sentence = goldSentence;
                }

                Tree guess;
                if (parse) {
                    guess = parser.parseTree(sentence);
                    if (guess == null) {
                        throw new RuntimeException("PARSER RETURNED NULL!!!");
                    }
                } else {
                    guess = subcategoryStripper.transformTree(Trees.toFlatTree(sentence));
                }

                if (out != null) {
                    if (parse) {
                        guess.pennPrint(out);
                    } else {
                        printWords(sentence, out);
                    }
                    out.println();
                }

                if (eval) {
                    scoreSentence(bracketer, basicEval, guess, gold, addCommonWordTagBrackets, "\nScores:");
                    Tree collinizedGuess = collinizer.transformTree(guess);
                    Tree collinizedGold = collinizer.transformTree(gold);
                    scoreSentence(bracketer, collinsEval, collinizedGuess, collinizedGold, addCommonWordTagBrackets, "\nCollinized scores:");
                    System.out.println();
                }
            }
            if (eval) {
                basicEval.display();
                System.out.println();
                collinsEval.display();
            }
        }
    }

    /**
     * Creates a treebank from the options' language params and loads the files selected
     * by {@code pathAndRanges[0]} (path) and {@code pathAndRanges[1]} (file-number ranges).
     */
    private static MemoryTreebank loadTreebank(Options op, String[] pathAndRanges) {
        MemoryTreebank treebank = op.tlpParams.memoryTreebank();
        treebank.loadPath(new File(pathAndRanges[0]), new NumberRangesFileFilter(pathAndRanges[1], false));
        return treebank;
    }

    /**
     * Returns {@code raw} unchanged unless {@code -annotate} was given, in which case a
     * new treebank holding the annotated version of every tree is returned.
     */
    private static MemoryTreebank annotateIfRequested(MemoryTreebank raw, Map<String, String[]> argMap, ChineseTreebankParserParams ctpp, Options op) {
        if (!argMap.containsKey("-annotate")) {
            return raw;
        }
        MemoryTreebank annotated = new MemoryTreebank();
        TreeAnnotator annotator = new TreeAnnotator(ctpp.headFinder(), ctpp, op);
        for (Tree tree : raw) {
            annotated.add(annotator.transformTree(tree));
        }
        log.info("Done annotating trees.");
        return annotated;
    }

    /**
     * Serializes {@code model} to {@code filename}; the stream is closed even when
     * writing fails.
     *
     * @param description noun used in the progress message ("parser" or "lexicon")
     */
    private static void writeSerialized(Object model, String filename, String description) throws IOException {
        log.info("Writing " + description + " in serialized format to file " + filename + AddDep.ATOM_DELIMITER);
        System.err.flush();
        try (ObjectOutputStream stream = IOUtils.writeStreamFromString(filename)) {
            stream.writeObject(model);
        }
        log.info("done.");
    }

    /**
     * Reads a serialized {@link Lexicon} from {@code filename}.
     *
     * <p>NOTE(review): this is Java-native deserialization; only point it at trusted files.
     *
     * @throws RuntimeException if the file references a class that cannot be found
     */
    private static Lexicon readLexicon(String filename) throws IOException {
        log.info("Reading Lexicon from file " + filename);
        try (ObjectInputStream stream = IOUtils.readStreamFromString(filename)) {
            return (Lexicon) stream.readObject();
        } catch (ClassNotFoundException e) {
            throw new RuntimeException("Bad serialized file: " + filename, e);
        }
    }

    /** Prints the words separated by the delimiter on the current line; an empty list prints nothing. */
    private static void printWords(List<? extends HasWord> words, PrintWriter out) {
        Iterator<? extends HasWord> it = words.iterator();
        while (it.hasNext()) {
            out.print(((Word) it.next()).word());
            if (it.hasNext()) {
                out.print(AddDep.ATOM_DELIMITER);
            }
        }
    }

    /**
     * Scores one guessed tree against its gold tree with {@code evaluator} and prints
     * {@code heading} followed by the score of this sentence to standard output.
     *
     * @param addCommonWordTagBrackets whether to add the common word/tag brackets of the
     *                                 two trees to both bracket sets before scoring
     */
    @SuppressWarnings({"rawtypes", "unchecked"}) // bracket collections and evaluator are used raw, as in the callers
    private static void scoreSentence(TreeToBracketProcessor bracketer, EquivalenceClassEval evaluator, Tree guess, Tree gold, boolean addCommonWordTagBrackets, String heading) {
        Collection guessBrackets = bracketer.allBrackets(guess);
        Collection goldBrackets = bracketer.allBrackets(gold);
        if (addCommonWordTagBrackets) {
            guessBrackets.addAll(TreeToBracketProcessor.commonWordTagTypeBrackets(guess, gold));
            goldBrackets.addAll(TreeToBracketProcessor.commonWordTagTypeBrackets(gold, guess));
        }
        evaluator.eval(guessBrackets, goldBrackets);
        System.out.println(heading);
        evaluator.displayLast();
    }
}
