public class PhraseTable
extends java.lang.Object
implements java.io.Serializable
Modifier and Type | Class and Description |
---|---|
static class |
PhraseTable.Phrase
A phrase is a multiword expression
|
static class |
PhraseTable.PhraseMatch
Represents a matched phrase
|
static class |
PhraseTable.PhraseStringCollection |
static class |
PhraseTable.StringList |
static class |
PhraseTable.TokenList |
static interface |
PhraseTable.WordList |
Modifier and Type | Field and Description |
---|---|
boolean |
caseInsensitive |
boolean |
ignorePunctuation |
boolean |
ignorePunctuationTokens |
boolean |
normalize |
static java.util.Comparator<PhraseTable.PhraseMatch> |
PHRASEMATCH_LENGTH_ENDPOINTS_COMPARATOR |
Annotator |
tokenizer |
Constructor and Description |
---|
PhraseTable() |
PhraseTable(boolean normalize,
boolean caseInsensitive,
boolean ignorePunctuation) |
PhraseTable(int initSize) |
Modifier and Type | Method and Description |
---|---|
boolean |
addPhrase(java.util.List<java.lang.String> tokens) |
boolean |
addPhrase(java.util.List<java.lang.String> tokens,
java.lang.String tag) |
boolean |
addPhrase(java.util.List<java.lang.String> tokens,
java.lang.String tag,
java.lang.Object phraseData) |
boolean |
addPhrase(java.lang.String phraseText) |
boolean |
addPhrase(java.lang.String phraseText,
java.lang.String tag) |
boolean |
addPhrase(java.lang.String phraseText,
java.lang.String tag,
java.lang.Object phraseData) |
void |
addPhrases(java.util.Collection<java.lang.String> phraseTexts) |
void |
addPhrases(java.util.Map<java.lang.String,java.lang.String> taggedPhraseTexts) |
protected int |
checkWordListMatch(PhraseTable.Phrase phrase,
PhraseTable.WordList tokens,
int tokenStart,
int tokenEnd,
int checkStart,
boolean matchEnd) |
void |
clear()
Clears this table
|
boolean |
containsKey(java.lang.Object key) |
java.util.List<PhraseTable.PhraseMatch> |
findAllMatches(java.util.List<PhraseTable.Phrase> acceptablePhrases,
PhraseTable.WordList tokens)
Given a list of tokens, returns a list of spans (PhraseMatch) that correspond
to phrases in the table (filtered by the list of acceptable phrases)
|
java.util.List<PhraseTable.PhraseMatch> |
findAllMatches(java.util.List<PhraseTable.Phrase> acceptablePhrases,
PhraseTable.WordList tokens,
int tokenStart,
int tokenEnd,
boolean needNormalization) |
java.util.List<PhraseTable.PhraseMatch> |
findAllMatches(java.util.List<PhraseTable.Phrase> acceptablePhrases,
java.lang.String text)
Given a segment of text, returns a list of spans (PhraseMatch) that correspond
to phrases in the table (filtered by the list of acceptable phrases)
|
java.util.List<PhraseTable.PhraseMatch> |
findAllMatches(PhraseTable.WordList tokens)
Given a list of tokens, returns a list of spans (PhraseMatch) that correspond
to phrases in the table
|
java.util.List<PhraseTable.PhraseMatch> |
findAllMatches(PhraseTable.WordList tokens,
int tokenStart,
int tokenEnd,
boolean needNormalization) |
java.util.List<PhraseTable.PhraseMatch> |
findAllMatches(java.lang.String text)
Given a segment of text, returns a list of spans (PhraseMatch) that correspond
to phrases in the table
|
protected java.util.List<PhraseTable.PhraseMatch> |
findMatches(java.util.Collection<PhraseTable.Phrase> acceptablePhrases,
PhraseTable.WordList tokens,
int tokenStart,
int tokenEnd,
boolean needNormalization,
boolean findAll,
boolean matchEnd) |
java.util.List<PhraseTable.PhraseMatch> |
findMatches(PhraseTable.WordList tokens) |
java.util.List<PhraseTable.PhraseMatch> |
findMatches(PhraseTable.WordList tokens,
int tokenStart,
int tokenEnd,
boolean needNormalization) |
java.util.List<PhraseTable.PhraseMatch> |
findMatches(java.lang.String text) |
java.util.List<PhraseTable.PhraseMatch> |
findMatches(java.lang.String text,
int tokenStart,
int tokenEnd,
boolean needNormalization) |
protected java.util.List<PhraseTable.PhraseMatch> |
findMatchesNormalized(java.util.Collection<PhraseTable.Phrase> acceptablePhrases,
PhraseTable.WordList tokens,
int tokenStart,
int tokenEnd,
boolean findAll,
boolean matchEnd) |
java.util.List<PhraseTable.PhraseMatch> |
findNonOverlappingPhrases(java.util.List<PhraseTable.PhraseMatch> phraseMatches) |
PhraseTable.Phrase |
get(java.lang.Object key) |
static PhraseTable.Phrase |
getLongestPhrase(java.util.List<PhraseTable.Phrase> phrases) |
java.lang.String |
getNormalizedForm(java.lang.String word) |
boolean |
isEmpty() |
java.util.Iterator<PhraseTable.Phrase> |
iterator() |
PhraseTable.Phrase |
lookup(PhraseTable.WordList wordList) |
PhraseTable.Phrase |
lookup(java.lang.String phrase) |
PhraseTable.Phrase |
lookupNormalized(java.lang.String phrase) |
void |
readPhrases(java.lang.String filename,
boolean checkTag)
Read in phrases from a file (assumed to be tab delimited)
|
void |
readPhrases(java.lang.String filename,
boolean checkTag,
java.util.regex.Pattern delimiterPattern) |
void |
readPhrases(java.lang.String filename,
boolean checkTag,
java.lang.String delimiterRegex)
Read in phrases from a file.
|
void |
readPhrases(java.lang.String filename,
int phraseColIndex,
int tagColIndex) |
void |
readPhrasesWithTagScores(java.lang.String filename)
Read in phrases where each pattern has a score of being associated with a certain tag.
|
void |
readPhrasesWithTagScores(java.lang.String filename,
java.util.regex.Pattern fieldDelimiterPattern,
java.util.regex.Pattern countDelimiterPattern) |
void |
readPhrasesWithTagScores(java.lang.String filename,
java.lang.String fieldDelimiterRegex,
java.lang.String countDelimiterRegex) |
void |
setNormalizationCacheSize(int cacheSize) |
java.lang.String[] |
splitText(java.lang.String phraseText) |
PhraseTable.WordList |
toNormalizedWordList(java.lang.String phraseText) |
static java.lang.String |
toString(PhraseTable.WordList wordList) |
PhraseTable.WordList |
toWordList(java.lang.String phraseText) |
public boolean normalize
public boolean caseInsensitive
public boolean ignorePunctuation
public boolean ignorePunctuationTokens
public Annotator tokenizer
public static final java.util.Comparator<PhraseTable.PhraseMatch> PHRASEMATCH_LENGTH_ENDPOINTS_COMPARATOR
public PhraseTable()
public PhraseTable(int initSize)
public PhraseTable(boolean normalize, boolean caseInsensitive, boolean ignorePunctuation)
public boolean isEmpty()
public boolean containsKey(java.lang.Object key)
public PhraseTable.Phrase get(java.lang.Object key)
public void clear()
public void setNormalizationCacheSize(int cacheSize)
public void readPhrases(java.lang.String filename, boolean checkTag) throws java.io.IOException
filename
- Name of file
checkTag
- Indicates if there is a tag column (assumed to be 2nd column).
If false, treats entire line as the phrase
java.io.IOException
public void readPhrases(java.lang.String filename, boolean checkTag, java.lang.String delimiterRegex) throws java.io.IOException
filename
- Name of file
checkTag
- Indicates if there is a tag column (assumed to be 2nd column).
If false, treats entire line as the phrase
delimiterRegex
- Regex for identifying column delimiter
java.io.IOException
public void readPhrases(java.lang.String filename, boolean checkTag, java.util.regex.Pattern delimiterPattern) throws java.io.IOException
java.io.IOException
public void readPhrasesWithTagScores(java.lang.String filename) throws java.io.IOException
filename
-
java.io.IOException
public void readPhrasesWithTagScores(java.lang.String filename, java.lang.String fieldDelimiterRegex, java.lang.String countDelimiterRegex) throws java.io.IOException
java.io.IOException
public void readPhrasesWithTagScores(java.lang.String filename, java.util.regex.Pattern fieldDelimiterPattern, java.util.regex.Pattern countDelimiterPattern) throws java.io.IOException
java.io.IOException
public void readPhrases(java.lang.String filename, int phraseColIndex, int tagColIndex) throws java.io.IOException
java.io.IOException
public static PhraseTable.Phrase getLongestPhrase(java.util.List<PhraseTable.Phrase> phrases)
public java.lang.String[] splitText(java.lang.String phraseText)
public PhraseTable.WordList toWordList(java.lang.String phraseText)
public PhraseTable.WordList toNormalizedWordList(java.lang.String phraseText)
public void addPhrases(java.util.Collection<java.lang.String> phraseTexts)
public void addPhrases(java.util.Map<java.lang.String,java.lang.String> taggedPhraseTexts)
public boolean addPhrase(java.lang.String phraseText)
public boolean addPhrase(java.lang.String phraseText, java.lang.String tag)
public boolean addPhrase(java.lang.String phraseText, java.lang.String tag, java.lang.Object phraseData)
public boolean addPhrase(java.util.List<java.lang.String> tokens)
public boolean addPhrase(java.util.List<java.lang.String> tokens, java.lang.String tag)
public boolean addPhrase(java.util.List<java.lang.String> tokens, java.lang.String tag, java.lang.Object phraseData)
public java.lang.String getNormalizedForm(java.lang.String word)
public PhraseTable.Phrase lookup(java.lang.String phrase)
public PhraseTable.Phrase lookupNormalized(java.lang.String phrase)
public PhraseTable.Phrase lookup(PhraseTable.WordList wordList)
public java.util.List<PhraseTable.PhraseMatch> findAllMatches(java.lang.String text)
text
- Input text to search over
public java.util.List<PhraseTable.PhraseMatch> findAllMatches(PhraseTable.WordList tokens)
tokens
- List of tokens to search over
public java.util.List<PhraseTable.PhraseMatch> findAllMatches(java.util.List<PhraseTable.Phrase> acceptablePhrases, java.lang.String text)
acceptablePhrases
- What phrases to look for (need to be subset of phrases already in table)
text
- Input text to search over
public java.util.List<PhraseTable.PhraseMatch> findAllMatches(java.util.List<PhraseTable.Phrase> acceptablePhrases, PhraseTable.WordList tokens)
acceptablePhrases
- What phrases to look for (need to be subset of phrases already in table)
tokens
- List of tokens to search over
public java.util.List<PhraseTable.PhraseMatch> findAllMatches(PhraseTable.WordList tokens, int tokenStart, int tokenEnd, boolean needNormalization)
public java.util.List<PhraseTable.PhraseMatch> findAllMatches(java.util.List<PhraseTable.Phrase> acceptablePhrases, PhraseTable.WordList tokens, int tokenStart, int tokenEnd, boolean needNormalization)
public java.util.List<PhraseTable.PhraseMatch> findMatches(java.lang.String text)
public java.util.List<PhraseTable.PhraseMatch> findMatches(PhraseTable.WordList tokens)
public java.util.List<PhraseTable.PhraseMatch> findMatches(PhraseTable.WordList tokens, int tokenStart, int tokenEnd, boolean needNormalization)
public java.util.List<PhraseTable.PhraseMatch> findMatches(java.lang.String text, int tokenStart, int tokenEnd, boolean needNormalization)
protected int checkWordListMatch(PhraseTable.Phrase phrase, PhraseTable.WordList tokens, int tokenStart, int tokenEnd, int checkStart, boolean matchEnd)
public java.util.List<PhraseTable.PhraseMatch> findNonOverlappingPhrases(java.util.List<PhraseTable.PhraseMatch> phraseMatches)
protected java.util.List<PhraseTable.PhraseMatch> findMatches(java.util.Collection<PhraseTable.Phrase> acceptablePhrases, PhraseTable.WordList tokens, int tokenStart, int tokenEnd, boolean needNormalization, boolean findAll, boolean matchEnd)
protected java.util.List<PhraseTable.PhraseMatch> findMatchesNormalized(java.util.Collection<PhraseTable.Phrase> acceptablePhrases, PhraseTable.WordList tokens, int tokenStart, int tokenEnd, boolean findAll, boolean matchEnd)
public java.util.Iterator<PhraseTable.Phrase> iterator()
public static java.lang.String toString(PhraseTable.WordList wordList)