|
|||||||||
| PREV CLASS NEXT CLASS | FRAMES NO FRAMES | ||||||||
| SUMMARY: NESTED | FIELD | CONSTR | METHOD | DETAIL: FIELD | CONSTR | METHOD | ||||||||
java.lang.Object
  edu.stanford.nlp.parser.lexparser.ChineseLexiconAndWordSegmenter
public class ChineseLexiconAndWordSegmenter
This class lets you train a lexicon and segmenter at the same time.
| Field Summary |
|---|
| Fields inherited from interface edu.stanford.nlp.parser.lexparser.Lexicon |
|---|
BOUNDARY, BOUNDARY_TAG, UNKNOWN_WORD |
| Constructor Summary | |
|---|---|
ChineseLexiconAndWordSegmenter(ChineseLexicon lex,
WordSegmenter seg)
|
|
ChineseLexiconAndWordSegmenter(java.lang.String segmenterFileOrUrl,
Options op)
Construct a new ChineseLexiconAndWordSegmenter. |
|
| Method Summary | |
|---|---|
void |
finishTraining()
Done collecting statistics for the lexicon. |
static ChineseLexiconAndWordSegmenter |
getSegmenterDataFromFile(java.lang.String parserFileOrUrl,
Options op)
|
protected static ChineseLexiconAndWordSegmenter |
getSegmenterDataFromSerializedFile(java.lang.String serializedFileOrUrl)
|
UnknownWordModel |
getUnknownWordModel()
|
void |
incrementTreesRead(double weight)
If training on a per-word basis instead of on a per-tree basis, we will want to increment the tree count as this happens |
void |
initializeTraining(double numTrees)
Start training this lexicon on the expected number of trees. |
boolean |
isKnown(int word)
Checks whether a word is in the lexicon. |
boolean |
isKnown(java.lang.String word)
Checks whether a word is in the lexicon. |
void |
loadSegmenter(java.lang.String filename)
|
static void |
main(java.lang.String[] args)
This method lets you train and test a segmenter relative to a Treebank. |
int |
numRules()
Returns the number of rules (tag rewrites as word) in the Lexicon. |
void |
readData(java.io.BufferedReader in)
Read the lexicon from the BufferedReader in the format written by writeData. |
java.util.Iterator<IntTaggedWord> |
ruleIteratorByWord(int word,
int loc,
java.lang.String featureSpec)
Get an iterator over all rules (pairs of (word, POS)) for this word. |
java.util.Iterator<IntTaggedWord> |
ruleIteratorByWord(java.lang.String word,
int loc,
java.lang.String featureSpec)
Same thing, but with a string that needs to be translated by the lexicon's word index |
float |
score(IntTaggedWord iTW,
int loc,
java.lang.String word,
java.lang.String featureSpec)
Get the score of this word with this tag (as an IntTaggedWord) at this loc. |
java.util.List<HasWord> |
segment(java.lang.String s)
|
void |
setUnknownWordModel(UnknownWordModel uwm)
|
void |
train(java.util.Collection<Tree> trees)
Trains this lexicon on the Collection of trees. |
void |
train(java.util.Collection<Tree> trees,
java.util.Collection<Tree> rawTrees)
|
void |
train(java.util.Collection<Tree> trees,
double weight)
|
void |
train(java.util.List<TaggedWord> sentence)
|
void |
train(java.util.List<TaggedWord> sentence,
double weight)
Not all subclasses support this particular method. |
void |
train(TaggedWord tw,
int loc,
double weight)
Not all subclasses support this particular method. |
void |
train(Tree tree)
|
void |
train(Tree tree,
double weight)
|
void |
trainUnannotated(java.util.List<TaggedWord> sentence,
double weight)
Sometimes we might have a sentence of tagged words which we would like to add to the lexicon, but they weren't part of a binarized, markovized, or otherwise annotated tree. |
void |
writeData(java.io.Writer w)
Write the lexicon in human-readable format to the Writer. |
| Methods inherited from class java.lang.Object |
|---|
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait |
| Constructor Detail |
|---|
public ChineseLexiconAndWordSegmenter(ChineseLexicon lex,
WordSegmenter seg)
public ChineseLexiconAndWordSegmenter(java.lang.String segmenterFileOrUrl,
Options op)
java.lang.IllegalArgumentException - If segmenter data cannot be loaded
| Method Detail |
|---|
public java.util.List<HasWord> segment(java.lang.String s)
segment in interface WordSegmenter
public boolean isKnown(int word)
Lexicon
isKnown in interface Lexicon
word - The word as an int
public boolean isKnown(java.lang.String word)
Lexicon
isKnown in interface Lexicon
word - The word as a String
public java.util.Iterator<IntTaggedWord> ruleIteratorByWord(int word,
int loc,
java.lang.String featureSpec)
Lexicon
ruleIteratorByWord in interface Lexicon
word - The word, represented as an integer in Index
loc - The position of the word in the sentence (counting from 0). Implementation note: The BaseLexicon class doesn't actually make use of this position information.
featureSpec - Additional word features like morphosyntactic information.
tag -> word rule.)
public java.util.Iterator<IntTaggedWord> ruleIteratorByWord(java.lang.String word,
int loc,
java.lang.String featureSpec)
Lexicon
ruleIteratorByWord in interface Lexicon
public int numRules()
numRules in interface Lexicon
public void initializeTraining(double numTrees)
Lexicon
initializeTraining in interface Lexicon
initializeTraining in interface WordSegmenter
public void train(java.util.Collection<Tree> trees)
Lexicon
train in interface Lexicon
train in interface WordSegmenter
trees - Trees to train on
public void train(java.util.Collection<Tree> trees,
double weight)
train in interface Lexicon
public void train(Tree tree)
train in interface WordSegmenter
public void train(Tree tree,
double weight)
train in interface Lexicon
public void train(java.util.List<TaggedWord> sentence)
train in interface WordSegmenter
public void train(java.util.List<TaggedWord> sentence,
double weight)
Lexicon
train in interface Lexicon
public void trainUnannotated(java.util.List<TaggedWord> sentence,
double weight)
Lexicon
trainUnannotated in interface Lexicon
public void incrementTreesRead(double weight)
Lexicon
incrementTreesRead in interface Lexicon
public void train(TaggedWord tw,
int loc,
double weight)
Lexicon
train in interface Lexicon
public void finishTraining()
Lexicon
finishTraining in interface Lexicon
finishTraining in interface WordSegmenter
public float score(IntTaggedWord iTW,
int loc,
java.lang.String word,
java.lang.String featureSpec)
Lexicon
score in interface Lexicon
iTW - An IntTaggedWord pairing a word and POS tag
loc - The position in the sentence. In the default implementation this is used only for unknown words to change their probability distribution when sentence initial.
word - The word itself; useful so we don't have to look it up in an index
featureSpec - TODO
public void loadSegmenter(java.lang.String filename)
loadSegmenter in interface WordSegmenter
public void readData(java.io.BufferedReader in)
throws java.io.IOException
Lexicon
readData in interface Lexicon
in - The BufferedReader to read from
java.io.IOException - If any I/O problem
public void writeData(java.io.Writer w)
throws java.io.IOException
Lexicon
writeData in interface Lexicon
w - The writer to output to
java.io.IOException - If any I/O problem
public static ChineseLexiconAndWordSegmenter getSegmenterDataFromFile(java.lang.String parserFileOrUrl,
Options op)
protected static ChineseLexiconAndWordSegmenter getSegmenterDataFromSerializedFile(java.lang.String serializedFileOrUrl)
public static void main(java.lang.String[] args)
Implementation note: This method is largely cloned from LexicalizedParser's main method. Should we try to have it be able to train segmenters to stop things going out of sync?
public UnknownWordModel getUnknownWordModel()
getUnknownWordModel in interface Lexicon
public void setUnknownWordModel(UnknownWordModel uwm)
setUnknownWordModel in interface Lexicon
public void train(java.util.Collection<Tree> trees,
java.util.Collection<Tree> rawTrees)
train in interface Lexicon
|
|||||||||
| PREV CLASS NEXT CLASS | FRAMES NO FRAMES | ||||||||
| SUMMARY: NESTED | FIELD | CONSTR | METHOD | DETAIL: FIELD | CONSTR | METHOD | ||||||||