package edu.stanford.nlp.process;

import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.SentenceUtils;
import edu.stanford.nlp.ling.tokensregex.SequenceMatchRules;
import edu.stanford.nlp.process.DocumentPreprocessor;
import java.io.BufferedReader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.NoSuchElementException;
import junit.framework.TestCase;

/* loaded from: input_file:edu/stanford/nlp/process/DocumentPreprocessorTest.class */
/**
 * JUnit 3 tests for {@link DocumentPreprocessor}.
 *
 * <p>Each test feeds a string to a {@code DocumentPreprocessor}, renders every
 * sentence it yields with {@link SentenceUtils#listToString}, and compares the
 * rendered sentences against an expected array.
 */
public class DocumentPreprocessorTest extends TestCase {
    /** XML fixture mixing {@code <text>} elements (some empty, some nested) with untagged text. */
    private static final String BASIC_XML_TEST = "<xml><text>The previous test was a lie.  I didn't make this test in my office; I made it at home.</text>\nMy home currently smells like dog vomit.\n<text apartment=\"stinky\">My dog puked everywhere after eating some carrots the other day.\n  Hopefully I have cleaned the last of it, though.</text>\n\nThis tests to see what happens on an empty tag:<text></text><text>It shouldn't include a blank sentence, but it should include this sentence.</text>this is madness...<text>no, this <text> is </text> NESTED!</text>This only prints 'no, this is' instead of 'no, this is NESTED'.  Doesn't do what i would expect, but it's consistent with the old behavior.</xml>";

    /**
     * Drains the preprocessor's iterator and renders each sentence as a string.
     *
     * @param preprocessor the already-configured preprocessor to read from
     * @return the rendered sentences, in iteration order
     */
    private static List<String> collectSentences(DocumentPreprocessor preprocessor) {
        List<String> sentences = new ArrayList<String>();
        Iterator<List<HasWord>> it = preprocessor.iterator();
        while (it.hasNext()) {
            sentences.add(SentenceUtils.listToString(it.next()));
        }
        return sentences;
    }

    /**
     * Asserts that the produced sentences match the expected ones, first by
     * count and then element by element, with a descriptive failure message.
     *
     * @param expected the sentences the preprocessor should have produced
     * @param actual the sentences it actually produced
     */
    private static void assertSentences(String[] expected, List<String> actual) {
        assertEquals("Should be " + expected.length + " sentences but got " + actual.size() + ": " + actual, expected.length, actual.size());
        for (int i = 0; i < actual.size(); i++) {
            assertEquals("Failed on sentence " + i, expected[i], actual.get(i));
        }
    }

    /**
     * Runs the preprocessor over {@code input} with its default configuration
     * and checks the resulting sentences.
     *
     * @param input the raw text to preprocess
     * @param expected the expected rendered sentences
     */
    private static void runTest(String input, String[] expected) {
        runTest(input, expected, null, false);
    }

    /**
     * Runs the preprocessor over {@code input} and checks the resulting sentences.
     *
     * @param input the raw text to preprocess
     * @param expected the expected rendered sentences
     * @param sentenceFinalPuncWords if non-null, passed to
     *        {@code setSentenceFinalPuncWords}; if null, the setter is not called
     * @param newlineDelimited if true, the tokenizer factory is set to
     *        {@code null} and the sentence delimiter to {@code "\n"}
     */
    private static void runTest(String input, String[] expected, String[] sentenceFinalPuncWords, boolean newlineDelimited) {
        DocumentPreprocessor preprocessor = new DocumentPreprocessor(new BufferedReader(new StringReader(input)));
        if (sentenceFinalPuncWords != null) {
            preprocessor.setSentenceFinalPuncWords(sentenceFinalPuncWords);
        }
        if (newlineDelimited) {
            preprocessor.setTokenizerFactory(null);
            preprocessor.setSentenceDelimiter("\n");
        }
        assertSentences(expected, collectSentences(preprocessor));
    }

    public void testText() {
        runTest("This is a test of the preprocessor2.  It should split this text into sentences.  I like resting my feet on my desk.  Hopefully the people around my office don't hear me singing along to my music, and if they do, hopefully they aren't annoyed.  My test cases are probably terrifying looks into my psyche.", new String[]{"This is a test of the preprocessor2 .", "It should split this text into sentences .", "I like resting my feet on my desk .", "Hopefully the people around my office do n't hear me singing along to my music , and if they do , hopefully they are n't annoyed .", "My test cases are probably terrifying looks into my psyche ."});
    }

    public void testNearFinalPunctuation() {
        runTest("Mount. Annaguan", new String[]{"Mount .", "Annaguan"});
    }

    public void testNearFinalPunctuation2() {
        runTest("(I lied.)", new String[]{"-LRB- I lied . -RRB-"});
    }

    public void testSetSentencePunctWords() {
        runTest("This is a test of the preprocessor2... it should split this text into sentences? This should be a different sentence.This should be attached to the previous sentence, though.", new String[]{"This is a test of the preprocessor2 ...", "it should split this text into sentences ?", "This should be a different sentence.This should be attached to the previous sentence , though ."}, new String[]{".", "?", "!", "...", "\n"}, false);
    }

    public void testWhitespaceTokenization() {
        runTest("This is a whitespace tokenized test case . \n  This should be the second sentence    . \n \n  \n\n  This should be the third sentence .  \n  This should be one sentence . The period should not break it . \n This is the fifth sentence , with a weird period at the end.", new String[]{"This is a whitespace tokenized test case .", "This should be the second sentence .", "This should be the third sentence .", "This should be one sentence . The period should not break it .", "This is the fifth sentence , with a weird period at the end."}, null, true);
    }

    /**
     * Runs the preprocessor in XML mode over {@code xml}, restricted to the
     * given element delimiter, and checks the resulting sentences.
     *
     * @param xml the XML document text
     * @param elementDelimiter the value passed to {@code setElementDelimiter}
     * @param expected the expected rendered sentences (may be empty)
     */
    private static void compareXMLResults(String xml, String elementDelimiter, String... expected) {
        DocumentPreprocessor preprocessor = new DocumentPreprocessor(new BufferedReader(new StringReader(xml)), DocumentPreprocessor.DocType.XML);
        preprocessor.setElementDelimiter(elementDelimiter);
        assertSentences(expected, collectSentences(preprocessor));
    }

    public void testXMLBasic() {
        // The fixture's sentences live in <text> elements, so the delimiter is the
        // literal tag name rather than a constant borrowed from an unrelated class.
        compareXMLResults(BASIC_XML_TEST, "text", "The previous test was a lie .", "I did n't make this test in my office ; I made it at home .", "My dog puked everywhere after eating some carrots the other day .", "Hopefully I have cleaned the last of it , though .", "It should n't include a blank sentence , but it should include this sentence .", "no , this is");
    }

    public void testXMLNoResults() {
        // No element in the fixture is named "zzzz", so no sentences are expected.
        compareXMLResults(BASIC_XML_TEST, "zzzz");
    }

    public void testXMLElementInText() {
        // The element name "wood" also occurs inside the sentence text ("woods").
        compareXMLResults("<xml><wood>There are many trees in the woods</wood></xml>", "wood", "There are many trees in the woods");
    }

    public void testXMLElementNotInText() {
        compareXMLResults("<xml><wood>There are many trees in the forest</wood></xml>", "wood", "There are many trees in the forest");
    }

    /**
     * Checks the iterator contract on a single-line input: repeated
     * {@code hasNext()} calls are stable before and after the only sentence is
     * consumed, and {@code next()} past the end throws
     * {@link NoSuchElementException}.
     */
    public void testPlainTextIterator() {
        String[] expectedWords = {"This", "is", "a", "one", "line", "test", "."};
        DocumentPreprocessor preprocessor = new DocumentPreprocessor(new BufferedReader(new StringReader("This is a one line test . \n")));
        preprocessor.setTokenizerFactory(null);
        preprocessor.setSentenceDelimiter("\n");
        Iterator<List<HasWord>> it = preprocessor.iterator();

        // hasNext() is called twice on purpose: it must not consume the sentence.
        assertTrue(it.hasNext());
        assertTrue(it.hasNext());

        List<HasWord> sentence = it.next();
        assertEquals(expectedWords.length, sentence.size());
        for (int i = 0; i < expectedWords.length; i++) {
            assertEquals(expectedWords[i], sentence.get(i).word());
        }

        // Likewise, hasNext() must stay false once the input is exhausted.
        assertFalse(it.hasNext());
        assertFalse(it.hasNext());
        try {
            it.next();
            fail("iterator.next() should have blown up");
        } catch (NoSuchElementException expected) {
            // Expected: the iterator is exhausted, and it must remain so.
            assertFalse(it.hasNext());
        }
    }
}
