|
|||||||||
| PREV CLASS NEXT CLASS | FRAMES NO FRAMES | ||||||||
| SUMMARY: NESTED | FIELD | CONSTR | METHOD | DETAIL: FIELD | CONSTR | METHOD | ||||||||
java.lang.Object
  edu.stanford.nlp.parser.lexparser.ChineseCharacterBasedLexicon
public class ChineseCharacterBasedLexicon
| Field Summary |
|---|
| Fields inherited from interface edu.stanford.nlp.parser.lexparser.Lexicon |
|---|
BOUNDARY, BOUNDARY_TAG, UNKNOWN_WORD |
| Constructor Summary | |
|---|---|
ChineseCharacterBasedLexicon(ChineseTreebankParserParams params,
Index<java.lang.String> wordIndex,
Index<java.lang.String> tagIndex)
|
|
| Method Summary | |
|---|---|
void |
finishTraining()
Done collecting statistics for the lexicon. |
Distribution<java.lang.String> |
getPOSDistribution()
|
UnknownWordModel |
getUnknownWordModel()
|
void |
incrementTreesRead(double weight)
If training on a per-word basis instead of on a per-tree basis, we will want to increment the tree count as this happens |
void |
initializeTraining(double numTrees)
Start training this lexicon on the expected number of trees. |
static boolean |
isForeign(java.lang.String s)
|
boolean |
isKnown(int word)
Checks whether a word is in the lexicon. |
boolean |
isKnown(java.lang.String word)
Checks whether a word is in the lexicon. |
int |
numRules()
Returns the number of rules (tag rewrites as word) in the Lexicon. |
void |
readData(java.io.BufferedReader in)
Read the lexicon from the BufferedReader in the format written by writeData. |
java.util.Iterator<IntTaggedWord> |
ruleIteratorByWord(int word,
int loc,
java.lang.String featureSpec)
Get an iterator over all rules (pairs of (word, POS)) for this word. |
java.util.Iterator<IntTaggedWord> |
ruleIteratorByWord(java.lang.String word,
int loc,
java.lang.String featureSpec)
Same thing, but with a string that needs to be translated by the lexicon's word index |
java.lang.String |
sampleFrom()
Samples over words regardless of POS: first samples POS, then samples word according to that POS |
java.lang.String |
sampleFrom(java.lang.String tag)
Samples from the distribution over words with this POS according to the lexicon. |
float |
score(IntTaggedWord iTW,
int loc,
java.lang.String word,
java.lang.String featureSpec)
Get the score of this word with this tag (as an IntTaggedWord) at this loc. |
void |
setUnknownWordModel(UnknownWordModel uwm)
|
void |
train(java.util.Collection<Tree> trees)
Train this lexicon on the given set of trees. |
void |
train(java.util.Collection<Tree> trees,
java.util.Collection<Tree> rawTrees)
|
void |
train(java.util.Collection<Tree> trees,
double weight)
Train this lexicon on the given set of trees. |
void |
train(java.util.List<TaggedWord> sentence,
double weight)
Not all subclasses support this particular method. |
void |
train(TaggedWord tw,
int loc,
double weight)
Not all subclasses support this particular method. |
void |
train(Tree tree,
double weight)
TODO: make this method do something with the weight |
void |
trainUnannotated(java.util.List<TaggedWord> sentence,
double weight)
Sometimes we might have a sentence of tagged words which we would like to add to the lexicon, but they weren't part of a binarized, markovized, or otherwise annotated tree. |
void |
writeData(java.io.Writer w)
Write the lexicon in human-readable format to the Writer. |
| Methods inherited from class java.lang.Object |
|---|
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait |
| Constructor Detail |
|---|
public ChineseCharacterBasedLexicon(ChineseTreebankParserParams params,
Index<java.lang.String> wordIndex,
Index<java.lang.String> tagIndex)
| Method Detail |
|---|
public void initializeTraining(double numTrees)
Lexicon
initializeTraining in interface Lexicon
public void train(java.util.Collection<Tree> trees)
train in interface Lexicon
trees - Trees to train on
public void train(java.util.Collection<Tree> trees,
double weight)
train in interface Lexicon
public void train(Tree tree,
double weight)
train in interface Lexicon
public void trainUnannotated(java.util.List<TaggedWord> sentence,
double weight)
Lexicon
trainUnannotated in interface Lexicon
public void incrementTreesRead(double weight)
Lexicon
incrementTreesRead in interface Lexicon
public void train(TaggedWord tw,
int loc,
double weight)
Lexicon
train in interface Lexicon
public void train(java.util.List<TaggedWord> sentence,
double weight)
Lexicon
train in interface Lexicon
public void finishTraining()
Lexicon
finishTraining in interface Lexicon
public Distribution<java.lang.String> getPOSDistribution()
public static boolean isForeign(java.lang.String s)
public float score(IntTaggedWord iTW,
int loc,
java.lang.String word,
java.lang.String featureSpec)
Lexicon
score in interface Lexicon
iTW - An IntTaggedWord pairing a word and POS tag
loc - The position in the sentence. In the default implementation
this is used only for unknown words to change their
probability distribution when sentence initial.
word - The word itself; useful so we don't have to look it
up in an index
featureSpec - TODO
public java.lang.String sampleFrom(java.lang.String tag)
tag - the POS of the word to sample
public java.lang.String sampleFrom()
public java.util.Iterator<IntTaggedWord> ruleIteratorByWord(int word,
int loc,
java.lang.String featureSpec)
Lexicon
ruleIteratorByWord in interface Lexicon
word - The word, represented as an integer in Index
loc - The position of the word in the sentence (counting from 0).
Implementation note: The BaseLexicon class doesn't
actually make use of this position information.
featureSpec - Additional word features like morphosyntactic information.
tag -> word rule.)
public java.util.Iterator<IntTaggedWord> ruleIteratorByWord(java.lang.String word,
int loc,
java.lang.String featureSpec)
Lexicon
ruleIteratorByWord in interface Lexicon
public int numRules()
numRules in interface Lexicon
public void readData(java.io.BufferedReader in)
throws java.io.IOException
Lexicon
readData in interface Lexicon
in - The BufferedReader to read from
java.io.IOException - If any I/O problem
public void writeData(java.io.Writer w)
throws java.io.IOException
Lexicon
writeData in interface Lexicon
w - The writer to output to
java.io.IOException - If any I/O problem
public boolean isKnown(int word)
Lexicon
isKnown in interface Lexicon
word - The word as an int
public boolean isKnown(java.lang.String word)
Lexicon
isKnown in interface Lexicon
word - The word as a String
public UnknownWordModel getUnknownWordModel()
getUnknownWordModel in interface Lexicon
public void setUnknownWordModel(UnknownWordModel uwm)
setUnknownWordModel in interface Lexicon
public void train(java.util.Collection<Tree> trees,
java.util.Collection<Tree> rawTrees)
train in interface Lexicon
|
|||||||||
| PREV CLASS NEXT CLASS | FRAMES NO FRAMES | ||||||||
| SUMMARY: NESTED | FIELD | CONSTR | METHOD | DETAIL: FIELD | CONSTR | METHOD | ||||||||