Modifier and Type | Class and Description |
---|---|
static class |
RobustTokenizer.AbbreviationMap |
static class |
RobustTokenizer.WordToken |
nextToken
Constructor and Description |
---|
RobustTokenizer(boolean caseInsensitive,
String buffer) |
RobustTokenizer(String buffer) |
Modifier and Type | Method and Description |
---|---|
static String |
block(String s)
wraps the expression in parentheses
|
int |
countNewLines(String s,
int start,
int end) |
protected Word |
getNext()
Internally fetches the next token.
|
static boolean |
isAcronym(String s) |
static boolean |
isDigitSeq(String s) |
static boolean |
isEmail(String s) |
static boolean |
isSgml(String s) |
static boolean |
isSlashDate(String s) |
static boolean |
isUrl(String s) |
static void |
main(String[] argv) |
static String |
oneOrMore(String s)
one or more
|
static String |
or(String s1,
String s2)
any of the two
|
static String |
or(String s1,
String s2,
String s3)
any of the three
|
static String |
or(String s1,
String s2,
String s3,
String s4)
any of the four
|
static String |
or(String s1,
String s2,
String s3,
String s4,
String s5)
any of the five
|
static String |
or(String s1,
String s2,
String s3,
String s4,
String s5,
String s6)
any of the six
|
static String |
or(String s1,
String s2,
String s3,
String s4,
String s5,
String s6,
String s7)
any of the seven
|
static String |
or(String s1,
String s2,
String s3,
String s4,
String s5,
String s6,
String s7,
String s8)
any of the eight
|
static String |
or(String s1,
String s2,
String s3,
String s4,
String s5,
String s6,
String s7,
String s8,
String s9)
any of the nine
|
static String |
or(String s1,
String s2,
String s3,
String s4,
String s5,
String s6,
String s7,
String s8,
String s9,
String s10) |
static String |
or(String s1,
String s2,
String s3,
String s4,
String s5,
String s6,
String s7,
String s8,
String s9,
String s10,
String s11) |
static String |
or(String s1,
String s2,
String s3,
String s4,
String s5,
String s6,
String s7,
String s8,
String s9,
String s10,
String s11,
String s12) |
protected List<RobustTokenizer.WordToken> |
postprocess(List<RobustTokenizer.WordToken> tokens)
Redefine this method to implement additional domain-specific tokenization rules
|
static String |
range(String s)
any in the set
|
static String |
rangeNot(String s)
any not in the set
|
String |
tokenizeText()
Tokenizes and adds blank spaces where needed between each token
|
Word[] |
tokenizeToWords()
Smart tokenization storing the output in an array of CoreLabel
Sets the following fields:
- TextAnnotation - the text of the token
- TokenBeginAnnotation - the byte offset of the token (start)
- TokenEndAnnotation - the byte offset of the token (end)
|
List<RobustTokenizer.WordToken> |
tokenizeToWordTokens()
Tokenizes a natural language string
|
static String |
zeroOrMore(String s)
zero or more
|
static String |
zeroOrOne(String s)
zero or one
|
hasNext, next, peek, remove, tokenize
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
forEachRemaining
public static final int MAX_MULTI_WORD_SIZE
public static final String DOT
public static final String DOTDOT
public static final String APOSTROPHE
public static final String SLASH
public static final String UNDERSCORE
public static final String MINUS
public static final String PLUS
public static final String COMMA
public static final String DOTCOMMA
public static final String QUOTES
public static final String DOUBLE_QUOTES
public static final String LRB
public static final String RRB
public static final String LCB
public static final String RCB
public static final String GREATER
public static final String LOWER
public static final String AMPERSAND
public static final String AT
public static final String HTTP
public static final String WHITE_SPACE
public static final String DIGIT
public static final String LETTER
public static final String UPPER
public static final String SIGN
public static final String FULLNUM
public static final String DECNUM
public static final String NUM
public static final String DATE
public static final String TIME
public static final String PUNC
public static final String LETTERS
public static final String BLOCK
public static final String WORD
public static final String ACRONYM
public static final String LOOSE_ACRONYM
public static final String PAREN
public static final String SGML
public static final String HTMLCODE
public static final String ANY
public static final String EMAIL
public static final String DOMAIN_EMAIL
public static final String URL
public static final String SMALL_URL
public static final String UNDERSCORESEQ
public static final String LIST_BULLET
public static final String PHONE_PART
public static final String DIGITSEQ
public static final String RECOGNISED_PATTERN
public RobustTokenizer(String buffer)
public RobustTokenizer(boolean caseInsensitive, String buffer)
public static String or(String s1, String s2, String s3, String s4, String s5, String s6)
public static String or(String s1, String s2, String s3, String s4, String s5, String s6, String s7)
public static String or(String s1, String s2, String s3, String s4, String s5, String s6, String s7, String s8)
public static String or(String s1, String s2, String s3, String s4, String s5, String s6, String s7, String s8, String s9)
public static String or(String s1, String s2, String s3, String s4, String s5, String s6, String s7, String s8, String s9, String s10)
public static String or(String s1, String s2, String s3, String s4, String s5, String s6, String s7, String s8, String s9, String s10, String s11)
public static String or(String s1, String s2, String s3, String s4, String s5, String s6, String s7, String s8, String s9, String s10, String s11, String s12)
public static boolean isUrl(String s)
public static boolean isEmail(String s)
public static boolean isSgml(String s)
public static boolean isSlashDate(String s)
public static boolean isAcronym(String s)
public static boolean isDigitSeq(String s)
public int countNewLines(String s, int start, int end)
public Word[] tokenizeToWords()
public List<RobustTokenizer.WordToken> tokenizeToWordTokens()
protected List<RobustTokenizer.WordToken> postprocess(List<RobustTokenizer.WordToken> tokens)
tokens
public String tokenizeText() throws IOException
IOException
protected Word getNext()
AbstractTokenizer
getNext
in class AbstractTokenizer<Word>