package edu.stanford.nlp.international.arabic.pipeline;

import edu.stanford.nlp.international.arabic.IBMArabicEscaper;
import edu.stanford.nlp.ling.SentenceUtils;
import edu.stanford.nlp.ling.Word;
import edu.stanford.nlp.trees.treebank.ConfigParser;
import edu.stanford.nlp.trees.treebank.Dataset;
import edu.stanford.nlp.trees.treebank.Mapper;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.logging.Redwood;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.LineNumberReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Properties;
import java.util.Set;
import java.util.regex.Pattern;

/* loaded from: input_file:edu/stanford/nlp/international/arabic/pipeline/IBMMTArabicDataset.class */
/**
 * A {@link Dataset} that copies whitespace-tokenized UTF-8 text files into a single
 * UTF-8 output file, rewriting every token that contains at least one character in
 * the range U+0600..U+06FF.
 *
 * <p>Each such token is first passed through the {@link IBMArabicEscaper} and then
 * through the lexical {@link Mapper}; all other tokens are written unchanged.
 * Configure the instance with {@link #setOptions(Properties)} and then call
 * {@link #build()}.
 *
 * <p>Instances hold mutable state and perform no synchronization.
 */
public class IBMMTArabicDataset implements Dataset {
    /** Output file name; derived from the dataset name in {@link #setOptions(Properties)}. */
    protected String outFileName;
    /** Option names that must have been seen before {@link #setOptions(Properties)} succeeds. */
    protected final Set<String> requiredOptions;
    /** Logging channel for this class (diagnostics below still go to {@code System.err}). */
    private static final Redwood.RedwoodChannels log = Redwood.channels(IBMMTArabicDataset.class);
    /**
     * Matches any single character in U+0600..U+06FF. Both bounds are written as
     * Unicode escapes so the pattern does not depend on the source-file encoding.
     */
    private static final Pattern utf8ArabicChart = Pattern.compile("[\u0600-\u06FF]");
    /** Token mapper; created by a successful {@link #setOptions(Properties)}. */
    protected Mapper lexMapper = null;
    /** Matches runs of whitespace in the dataset name; each run becomes a single '-'. */
    protected final Pattern fileNameNormalizer = Pattern.compile("\\s+");
    /** Option names seen so far by {@link #setOptions(Properties)}. */
    protected final Set<String> configuredOptions = Generics.newHashSet();
    /** Accumulates the human-readable summary returned by {@link #toString()}. */
    protected final StringBuilder toStringBuffer = new StringBuilder();
    /** Input files, in the order they were registered by {@link #setOptions(Properties)}. */
    protected final List<File> pathsToData = new ArrayList<File>();
    /** Escaper applied to matching tokens before the lexical mapper. */
    protected final IBMArabicEscaper escaper = new IBMArabicEscaper(true);

    /**
     * Creates an unconfigured dataset: silences escaper warnings and registers the
     * dataset name and the input path as required options.
     */
    public IBMMTArabicDataset() {
        this.escaper.disableWarnings();
        this.requiredOptions = Generics.newHashSet();
        this.requiredOptions.add(ConfigParser.paramName);
        this.requiredOptions.add(ConfigParser.paramPath);
    }

    /**
     * Reads every configured input file line by line, rewrites the matching tokens and
     * writes one output line per input line to {@link #outFileName}.
     *
     * <p>Problems are reported on {@code System.err} rather than thrown. Processing stops
     * at the first input file that fails. The output writer and every input reader are
     * closed on all paths.
     */
    @Override // edu.stanford.nlp.trees.treebank.Dataset
    public void build() {
        if (this.outFileName == null) {
            // Without this guard the failure would surface as an unexplained NullPointerException.
            System.err.printf("%s: No output file name configured; call setOptions() before build()\n", getClass().getName());
            return;
        }
        PrintWriter outfile = openOutput();
        if (outfile == null) {
            return;
        }
        try {
            for (File path : this.pathsToData) {
                if (!convertFile(path, outfile)) {
                    break;
                }
            }
            // PrintWriter never throws on write failures; ask it explicitly.
            if (outfile.checkError()) {
                System.err.printf("%s: Error writing to %s\n", getClass().getName(), this.outFileName);
            }
        } finally {
            outfile.close();
        }
    }

    /**
     * Opens {@link #outFileName} for UTF-8 output.
     *
     * @return the writer, or {@code null} if the file could not be opened (already reported)
     */
    private PrintWriter openOutput() {
        try {
            return new PrintWriter(new BufferedWriter(new OutputStreamWriter(new FileOutputStream(this.outFileName), "UTF-8")));
        } catch (FileNotFoundException e) {
            System.err.printf("%s: Could not open %s for writing\n", getClass().getName(), this.outFileName);
        } catch (UnsupportedEncodingException e) {
            System.err.printf("%s: Filesystem does not support UTF-8 output\n", getClass().getName());
            e.printStackTrace();
        }
        return null;
    }

    /**
     * Converts a single input file and appends the result to {@code outfile}.
     *
     * @param path    input file to read as UTF-8
     * @param outfile destination for the converted lines
     * @return {@code true} if the whole file was processed, {@code false} if an error was reported
     */
    private boolean convertFile(File path, PrintWriter outfile) {
        final String inputName = path.getPath();
        LineNumberReader infile = null;
        try {
            infile = new LineNumberReader(new BufferedReader(new InputStreamReader(new FileInputStream(path), "UTF-8")));
            String line;
            // readLine() returning null is the reliable end-of-file signal; ready() only
            // says whether a read would block.
            while ((line = infile.readLine()) != null) {
                ArrayList<Word> sentence = SentenceUtils.toUntaggedList(line.split("\\s+"));
                for (Word token : sentence) {
                    if (utf8ArabicChart.matcher(token.word()).find()) {
                        token.setWord(this.escaper.apply(token.word()));
                        token.setWord(this.lexMapper.map(null, token.word()));
                    }
                }
                outfile.println(SentenceUtils.listToString(sentence));
            }
            this.toStringBuffer.append(String.format(" Read %d input lines from %s", infile.getLineNumber(), inputName));
            return true;
        } catch (UnsupportedEncodingException e) {
            System.err.printf("%s: Filesystem does not support UTF-8 output\n", getClass().getName());
            e.printStackTrace();
        } catch (FileNotFoundException e) {
            System.err.printf("%s: Could not open %s for reading\n", getClass().getName(), inputName);
        } catch (IOException e) {
            System.err.printf("%s: Error reading from %s (line %d)\n", getClass().getName(), inputName, lineNumberOf(infile));
        } catch (RuntimeException e) {
            System.err.printf("%s: Input sentence from %s contains token mapped to null (line %d)\n", getClass().getName(), inputName, lineNumberOf(infile));
            e.printStackTrace();
        } finally {
            closeReader(infile, inputName);
        }
        return false;
    }

    /** Returns the reader's current line number, or 0 if the reader was never opened. */
    private static int lineNumberOf(LineNumberReader reader) {
        return reader == null ? 0 : reader.getLineNumber();
    }

    /** Closes {@code reader} if it was opened, reporting (not throwing) any failure. */
    private void closeReader(LineNumberReader reader, String inputName) {
        if (reader == null) {
            return;
        }
        try {
            reader.close();
        } catch (IOException e) {
            System.err.printf("%s: Error closing %s\n", getClass().getName(), inputName);
        }
    }

    /**
     * Returns the names of the files this dataset writes.
     *
     * @return a new single-element list holding {@link #outFileName}
     */
    @Override // edu.stanford.nlp.trees.treebank.Dataset
    public List<String> getFilenames() {
        List<String> names = new ArrayList<String>();
        names.add(this.outFileName);
        return names;
    }

    /**
     * Returns the summary accumulated so far: the dataset name recorded by
     * {@link #setOptions(Properties)} and one entry per file fully read by {@link #build()}.
     */
    @Override
    public String toString() {
        return this.toStringBuffer.toString();
    }

    /**
     * Configures the dataset from {@code properties}.
     *
     * <p>Keys matched by {@code ConfigParser.matchPath} register their value as an input
     * file. The {@code ConfigParser.paramName} key sets the output file name: the trimmed
     * value with each whitespace run replaced by '-'. Every other key is only recorded as seen.
     *
     * @param properties option names and values
     * @return {@code true} if all required options have been seen, in which case
     *         ".txt" is appended to the output name and the lexical mapper is created;
     *         {@code false} otherwise
     */
    @Override // edu.stanford.nlp.trees.treebank.Dataset
    public boolean setOptions(Properties properties) {
        for (String param : properties.stringPropertyNames()) {
            String value = properties.getProperty(param);
            if (value == null) {
                System.err.printf("%s: Read parameter with null value (%s)\n", getClass().getName(), param);
                continue;
            }
            this.configuredOptions.add(param);
            if (ConfigParser.matchPath.matcher(param).lookingAt()) {
                this.pathsToData.add(new File(value));
                // Path keys may carry suffixes; record the canonical name for the required-option check.
                this.configuredOptions.add(ConfigParser.paramPath);
            } else if (param.equals(ConfigParser.paramName)) {
                this.outFileName = this.fileNameNormalizer.matcher(value.trim()).replaceAll("-");
                this.toStringBuffer.append(String.format("Dataset Name: %s\n", value.trim()));
            }
        }
        if (!this.configuredOptions.containsAll(this.requiredOptions)) {
            return false;
        }
        this.outFileName += ".txt";
        this.lexMapper = new DefaultLexicalMapper();
        return true;
    }
}
