public class StatTokSentTrainer
extends java.lang.Object
# newdoc = tanl
# sent_id = isst_tanl-1
# text = LONDRA.
1 LONDRA Londra PROPN SP _ 0 root _ SpaceAfter=No
2 . . PUNCT FS _ 1 punct _ _
# sent_id = isst_tanl-2
# text = Gas dalla statua.
Extracted features for the tokenizer:
- n characters before
- n characters after
- character case
| Modifier and Type | Field and Description |
|---|---|
| static java.util.Set<java.lang.String> | ARGS_TO_DROP |
| Constructor and Description |
|---|
| StatTokSentTrainer(java.lang.String[] propertiesArguments) |
| Modifier and Type | Method and Description |
|---|---|
| java.util.List<java.lang.String> | addFeatures(java.util.ArrayList<Pair<java.lang.String,java.lang.String>> classCharsText, int windowSize)<br>This method adds features for building the training input for the classifier. |
| java.util.ArrayList<Pair<java.lang.String,java.lang.String>> | fileToTrainSet(java.lang.String trainFile, java.util.Map<java.lang.String,java.lang.String[]> multiWordRules) |
| static void | help() |
| static java.util.Map<java.lang.String,java.lang.String[]> | inferMultiWordRules(java.lang.String trainFile)<br>Method to infer multi-word token rules directly from the training set for tokenization. |
| static void | main(java.lang.String[] args)<br>Main method to train the tokenizer. |
| static java.util.Map<java.lang.String,java.lang.String[]> | readMultiWordRules(java.lang.String multiWordRulesFile)<br>Method to read multi-word token rules from a file. |
| static void | serialize(java.lang.String serializeTo, ColumnDataClassifier cdc, int windowSize) |
| static void | writeMultiWordRules(java.lang.String multiWordRulesFile, java.util.Map<java.lang.String,java.lang.String[]> rules) |
public StatTokSentTrainer(java.lang.String[] propertiesArguments)
public java.util.ArrayList<Pair<java.lang.String,java.lang.String>> fileToTrainSet(java.lang.String trainFile, java.util.Map<java.lang.String,java.lang.String[]> multiWordRules) throws java.io.IOException, java.io.FileNotFoundException
java.io.IOException
java.io.FileNotFoundException
public java.util.List<java.lang.String> addFeatures(java.util.ArrayList<Pair<java.lang.String,java.lang.String>> classCharsText, int windowSize)
public static java.util.Map<java.lang.String,java.lang.String[]> readMultiWordRules(java.lang.String multiWordRulesFile)
throws java.io.IOException
java.io.IOException
public static void writeMultiWordRules(java.lang.String multiWordRulesFile,
java.util.Map<java.lang.String,java.lang.String[]> rules)
throws java.io.IOException
java.io.IOException
public static java.util.Map<java.lang.String,java.lang.String[]> inferMultiWordRules(java.lang.String trainFile)
throws java.io.IOException,
java.io.FileNotFoundException
java.io.IOException
java.io.FileNotFoundException
public static void help()
public static void serialize(java.lang.String serializeTo,
ColumnDataClassifier cdc,
int windowSize)
throws java.io.IOException
java.io.IOException
public static void main(java.lang.String[] args)
throws java.io.IOException
java.io.IOException