public class StatTokSentTrainer
extends java.lang.Object
Example of training input in CoNLL-U format:
# newdoc = tanl
# sent_id = isst_tanl-1
# text = LONDRA.
1 LONDRA Londra PROPN SP _ 0 root _ SpaceAfter=No
2 . . PUNCT FS _ 1 punct _ _
# sent_id = isst_tanl-2
# text = Gas dalla statua.
Extracted features for tokenizer:
- n characters before
- n characters after
- character case
Modifier and Type | Field and Description |
---|---|
static java.util.Set<java.lang.String> |
ARGS_TO_DROP |
Constructor and Description |
---|
StatTokSentTrainer(java.lang.String[] propertiesArguments) |
Modifier and Type | Method and Description |
---|---|
java.util.List<java.lang.String> |
addFeatures(java.util.ArrayList<Pair<java.lang.String,java.lang.String>> classCharsText,
int windowSize)
This method adds features for building the training input for the classifier.
|
java.util.ArrayList<Pair<java.lang.String,java.lang.String>> |
fileToTrainSet(java.lang.String trainFile,
java.util.Map<java.lang.String,java.lang.String[]> multiWordRules) |
static void |
help() |
static java.util.Map<java.lang.String,java.lang.String[]> |
inferMultiWordRules(java.lang.String trainFile)
Method to infer multi-word token rules directly from the training set for tokenization.
|
static void |
main(java.lang.String[] args)
Main method to train the tokenizer.
|
static java.util.Map<java.lang.String,java.lang.String[]> |
readMultiWordRules(java.lang.String multiWordRulesFile)
Method to read multi-word token rules from a file.
|
static void |
serialize(java.lang.String serializeTo,
ColumnDataClassifier cdc,
int windowSize) |
static void |
writeMultiWordRules(java.lang.String multiWordRulesFile,
java.util.Map<java.lang.String,java.lang.String[]> rules) |
public StatTokSentTrainer(java.lang.String[] propertiesArguments)
public java.util.ArrayList<Pair<java.lang.String,java.lang.String>> fileToTrainSet(java.lang.String trainFile, java.util.Map<java.lang.String,java.lang.String[]> multiWordRules) throws java.io.IOException, java.io.FileNotFoundException
java.io.IOException
java.io.FileNotFoundException
public java.util.List<java.lang.String> addFeatures(java.util.ArrayList<Pair<java.lang.String,java.lang.String>> classCharsText, int windowSize)
public static java.util.Map<java.lang.String,java.lang.String[]> readMultiWordRules(java.lang.String multiWordRulesFile) throws java.io.IOException
java.io.IOException
public static void writeMultiWordRules(java.lang.String multiWordRulesFile, java.util.Map<java.lang.String,java.lang.String[]> rules) throws java.io.IOException
java.io.IOException
public static java.util.Map<java.lang.String,java.lang.String[]> inferMultiWordRules(java.lang.String trainFile) throws java.io.IOException, java.io.FileNotFoundException
java.io.IOException
java.io.FileNotFoundException
public static void help()
public static void serialize(java.lang.String serializeTo, ColumnDataClassifier cdc, int windowSize) throws java.io.IOException
java.io.IOException
public static void main(java.lang.String[] args) throws java.io.IOException
java.io.IOException