public class PhraseTable extends Object implements Serializable
Modifier and Type | Class and Description |
---|---|
static class |
PhraseTable.Phrase
A phrase is a multiword expression
|
static class |
PhraseTable.PhraseMatch
Represents a matched phrase
|
static class |
PhraseTable.PhraseStringCollection |
static class |
PhraseTable.StringList |
static class |
PhraseTable.TokenList |
static interface |
PhraseTable.WordList |
Modifier and Type | Field and Description |
---|---|
boolean |
caseInsensitive |
boolean |
ignorePunctuation |
boolean |
ignorePunctuationTokens |
boolean |
normalize |
static Comparator<PhraseTable.PhraseMatch> |
PHRASEMATCH_LENGTH_ENDPOINTS_COMPARATOR |
Annotator |
tokenizer |
Constructor and Description |
---|
PhraseTable() |
PhraseTable(boolean normalize,
boolean caseInsensitive,
boolean ignorePunctuation) |
PhraseTable(int initSize) |
Modifier and Type | Method and Description |
---|---|
boolean |
addPhrase(List<String> tokens) |
boolean |
addPhrase(List<String> tokens,
String tag) |
boolean |
addPhrase(List<String> tokens,
String tag,
Object phraseData) |
boolean |
addPhrase(String phraseText) |
boolean |
addPhrase(String phraseText,
String tag) |
boolean |
addPhrase(String phraseText,
String tag,
Object phraseData) |
void |
addPhrases(Collection<String> phraseTexts) |
void |
addPhrases(Map<String,String> taggedPhraseTexts) |
protected int |
checkWordListMatch(PhraseTable.Phrase phrase,
PhraseTable.WordList tokens,
int tokenStart,
int tokenEnd,
int checkStart,
boolean matchEnd) |
void |
clear()
Clears this table
|
boolean |
containsKey(Object key) |
List<PhraseTable.PhraseMatch> |
findAllMatches(List<PhraseTable.Phrase> acceptablePhrases,
PhraseTable.WordList tokens)
Given a list of tokens, returns a list of spans (PhraseMatch) that correspond
to phrases in the table (filtered by the list of acceptable phrases)
|
List<PhraseTable.PhraseMatch> |
findAllMatches(List<PhraseTable.Phrase> acceptablePhrases,
PhraseTable.WordList tokens,
int tokenStart,
int tokenEnd,
boolean needNormalization) |
List<PhraseTable.PhraseMatch> |
findAllMatches(List<PhraseTable.Phrase> acceptablePhrases,
String text)
Given a segment of text, returns a list of spans (PhraseMatch) that correspond
to phrases in the table (filtered by the list of acceptable phrases)
|
List<PhraseTable.PhraseMatch> |
findAllMatches(PhraseTable.WordList tokens)
Given a list of tokens, returns a list of spans (PhraseMatch) that correspond
to phrases in the table
|
List<PhraseTable.PhraseMatch> |
findAllMatches(PhraseTable.WordList tokens,
int tokenStart,
int tokenEnd,
boolean needNormalization) |
List<PhraseTable.PhraseMatch> |
findAllMatches(String text)
Given a segment of text, returns a list of spans (PhraseMatch) that correspond
to phrases in the table
|
protected List<PhraseTable.PhraseMatch> |
findMatches(Collection<PhraseTable.Phrase> acceptablePhrases,
PhraseTable.WordList tokens,
int tokenStart,
int tokenEnd,
boolean needNormalization,
boolean findAll,
boolean matchEnd) |
List<PhraseTable.PhraseMatch> |
findMatches(PhraseTable.WordList tokens) |
List<PhraseTable.PhraseMatch> |
findMatches(PhraseTable.WordList tokens,
int tokenStart,
int tokenEnd,
boolean needNormalization) |
List<PhraseTable.PhraseMatch> |
findMatches(String text) |
List<PhraseTable.PhraseMatch> |
findMatches(String text,
int tokenStart,
int tokenEnd,
boolean needNormalization) |
protected List<PhraseTable.PhraseMatch> |
findMatchesNormalized(Collection<PhraseTable.Phrase> acceptablePhrases,
PhraseTable.WordList tokens,
int tokenStart,
int tokenEnd,
boolean findAll,
boolean matchEnd) |
List<PhraseTable.PhraseMatch> |
findNonOverlappingPhrases(List<PhraseTable.PhraseMatch> phraseMatches) |
PhraseTable.Phrase |
get(Object key) |
static PhraseTable.Phrase |
getLongestPhrase(List<PhraseTable.Phrase> phrases) |
String |
getNormalizedForm(String word) |
boolean |
isEmpty() |
Iterator<PhraseTable.Phrase> |
iterator() |
PhraseTable.Phrase |
lookup(PhraseTable.WordList wordList) |
PhraseTable.Phrase |
lookup(String phrase) |
PhraseTable.Phrase |
lookupNormalized(String phrase) |
void |
readPhrases(String filename,
boolean checkTag)
Read in phrases from a file (assumed to be tab delimited)
|
void |
readPhrases(String filename,
boolean checkTag,
Pattern delimiterPattern) |
void |
readPhrases(String filename,
boolean checkTag,
String delimiterRegex)
Read in phrases from a file.
|
void |
readPhrases(String filename,
int phraseColIndex,
int tagColIndex) |
void |
readPhrasesWithTagScores(String filename)
Read in phrases where each pattern has a score of being associated with a certain tag.
|
void |
readPhrasesWithTagScores(String filename,
Pattern fieldDelimiterPattern,
Pattern countDelimiterPattern) |
void |
readPhrasesWithTagScores(String filename,
String fieldDelimiterRegex,
String countDelimiterRegex) |
void |
setNormalizationCacheSize(int cacheSize) |
String[] |
splitText(String phraseText) |
PhraseTable.WordList |
toNormalizedWordList(String phraseText) |
static String |
toString(PhraseTable.WordList wordList) |
PhraseTable.WordList |
toWordList(String phraseText) |
public boolean normalize
public boolean caseInsensitive
public boolean ignorePunctuation
public boolean ignorePunctuationTokens
public Annotator tokenizer
public static final Comparator<PhraseTable.PhraseMatch> PHRASEMATCH_LENGTH_ENDPOINTS_COMPARATOR
public PhraseTable()
public PhraseTable(int initSize)
public PhraseTable(boolean normalize, boolean caseInsensitive, boolean ignorePunctuation)
public boolean isEmpty()
public boolean containsKey(Object key)
public PhraseTable.Phrase get(Object key)
public void clear()
public void setNormalizationCacheSize(int cacheSize)
public void readPhrases(String filename, boolean checkTag) throws IOException
filename
- Name of file
checkTag
- Indicates if there is a tag column (assumed to be 2nd column).
If false, treats entire line as the phrase
IOException
public void readPhrases(String filename, boolean checkTag, String delimiterRegex) throws IOException
filename
- Name of file
checkTag
- Indicates if there is a tag column (assumed to be 2nd column).
If false, treats entire line as the phrase
delimiterRegex
- Regex for identifying column delimiter
IOException
public void readPhrases(String filename, boolean checkTag, Pattern delimiterPattern) throws IOException
IOException
public void readPhrasesWithTagScores(String filename) throws IOException
filename
-
IOException
public void readPhrasesWithTagScores(String filename, String fieldDelimiterRegex, String countDelimiterRegex) throws IOException
IOException
public void readPhrasesWithTagScores(String filename, Pattern fieldDelimiterPattern, Pattern countDelimiterPattern) throws IOException
IOException
public void readPhrases(String filename, int phraseColIndex, int tagColIndex) throws IOException
IOException
public static PhraseTable.Phrase getLongestPhrase(List<PhraseTable.Phrase> phrases)
public PhraseTable.WordList toWordList(String phraseText)
public PhraseTable.WordList toNormalizedWordList(String phraseText)
public void addPhrases(Collection<String> phraseTexts)
public boolean addPhrase(String phraseText)
public PhraseTable.Phrase lookup(String phrase)
public PhraseTable.Phrase lookupNormalized(String phrase)
public PhraseTable.Phrase lookup(PhraseTable.WordList wordList)
public List<PhraseTable.PhraseMatch> findAllMatches(String text)
text
- Input text to search over
public List<PhraseTable.PhraseMatch> findAllMatches(PhraseTable.WordList tokens)
tokens
- List of tokens to search over
public List<PhraseTable.PhraseMatch> findAllMatches(List<PhraseTable.Phrase> acceptablePhrases, String text)
acceptablePhrases
- What phrases to look for (need to be a subset of the phrases already in the table)
text
- Input text to search over
public List<PhraseTable.PhraseMatch> findAllMatches(List<PhraseTable.Phrase> acceptablePhrases, PhraseTable.WordList tokens)
acceptablePhrases
- What phrases to look for (need to be a subset of the phrases already in the table)
tokens
- List of tokens to search over
public List<PhraseTable.PhraseMatch> findAllMatches(PhraseTable.WordList tokens, int tokenStart, int tokenEnd, boolean needNormalization)
public List<PhraseTable.PhraseMatch> findAllMatches(List<PhraseTable.Phrase> acceptablePhrases, PhraseTable.WordList tokens, int tokenStart, int tokenEnd, boolean needNormalization)
public List<PhraseTable.PhraseMatch> findMatches(String text)
public List<PhraseTable.PhraseMatch> findMatches(PhraseTable.WordList tokens)
public List<PhraseTable.PhraseMatch> findMatches(PhraseTable.WordList tokens, int tokenStart, int tokenEnd, boolean needNormalization)
public List<PhraseTable.PhraseMatch> findMatches(String text, int tokenStart, int tokenEnd, boolean needNormalization)
protected int checkWordListMatch(PhraseTable.Phrase phrase, PhraseTable.WordList tokens, int tokenStart, int tokenEnd, int checkStart, boolean matchEnd)
public List<PhraseTable.PhraseMatch> findNonOverlappingPhrases(List<PhraseTable.PhraseMatch> phraseMatches)
protected List<PhraseTable.PhraseMatch> findMatches(Collection<PhraseTable.Phrase> acceptablePhrases, PhraseTable.WordList tokens, int tokenStart, int tokenEnd, boolean needNormalization, boolean findAll, boolean matchEnd)
protected List<PhraseTable.PhraseMatch> findMatchesNormalized(Collection<PhraseTable.Phrase> acceptablePhrases, PhraseTable.WordList tokens, int tokenStart, int tokenEnd, boolean findAll, boolean matchEnd)
public Iterator<PhraseTable.Phrase> iterator()
public static String toString(PhraseTable.WordList wordList)