public class ConstantsAndVariables extends Object implements Serializable
Modifier and Type | Class and Description |
---|---|
static class |
ConstantsAndVariables.DataSentsIterator |
static class |
ConstantsAndVariables.PatternForEachTokenWay |
static class |
ConstantsAndVariables.ScorePhraseMeasures |
Modifier and Type | Field and Description |
---|---|
boolean |
addIndvWordsFromPhrasesExceptLastAsNeg
For example, if positive seed dict contains "cancer" and "breast cancer" then "breast" is included as negative
|
Map<String,Set<String>> |
allowedNERsforLabels |
Map<String,Set<String>> |
allowedTagsInitials |
String |
allPatternsDir
Cached file of all patterns for all tokens
|
String |
backgroundSymbol |
boolean |
batchProcessSents
Use this option if you are limited by memory; ignored if fileFormat is ser.
|
boolean |
clubNeighboringLabeledWords |
String |
commonWordsPatternFiles
Words to be ignored when learning phrases if
removePhrasesWithStopWords or
removeStopWordsFromSelectedPhrases is true. |
boolean |
computeAllPatterns
If all patterns should be computed.
|
int |
debug
Debug flag for learning patterns.
|
Map<String,Counter<String>> |
dictOddsWeights |
Map<String,Counter<Integer>> |
distSimWeights |
boolean |
doNotApplyPatterns |
boolean |
doNotExtractPhraseAnyWordLabeledOtherClass
Especially useful for multi word phrase extraction.
|
String |
englishWordsFiles
English words that are not labeled when labeling using seed dictionaries
|
Map<String,Env> |
env
Environment for
TokenSequencePattern |
String |
externalFeatureWeightsFile |
static String |
extremedebug |
List<String> |
fillerWords |
String |
identifier
Save this run as ...
|
Pattern |
ignoreWordRegex
by default doesn't ignore anything.
|
boolean |
includeExternalFeatures |
SentenceIndex |
invertedIndex |
Class<? extends SentenceIndex> |
invertedIndexClass |
String |
invertedIndexDirectory
Where the inverted index (either in memory or lucene) is stored
|
boolean |
justify |
boolean |
loadInvertedIndex
You can load the inverted index using this file.
|
double |
LRSigma
Sigma for L2 regularization in Logistic regression, if a classifier is
used to score phrases
|
boolean |
matchLowerCaseContext
Lowercase the context words/lemmas
|
int |
maxExtractNumWords
Maximum number of words to learn
|
static String |
minimaldebug |
int |
minLen4FuzzyForPattern
Minimum length of words that can be matched fuzzily
|
int |
minPosPhraseSupportForPat
Remove patterns for which the number of positive words is less than this.
|
int |
minUnlabPhraseSupportForPat
Remove patterns for which the number of unlabeled words is less than this.
|
Integer |
numIterationsForPatterns
Maximum number of iterations to run
|
int |
numPatterns
Maximum number of patterns learned in each iteration
|
int |
numThreads
Number of threads
|
int |
numWordsCompound |
int |
numWordsToAdd
Number of words to learn in each iteration
|
String |
otherSemanticClassesFiles
List of dictionary phrases that are negative for all labels to be learned.
|
String |
outDir
The output directory where the justifications of learning patterns and
phrases would be saved.
|
ConcurrentHashIndex<SurfacePattern> |
patternIndex |
GetPatternsFromDataMultiClass.PatternScoring |
patternScoring
Pattern Scoring mechanism.
|
double |
perSelectNeg
These are used to learn weights for features if using logistic regression.
|
double |
perSelectRand
These are used to learn weights for features if using logistic regression.
|
boolean |
removeOverLappingLabelsFromSeed
Keeps only one label for each token, whichever has the longest
|
boolean |
removePhrasesWithStopWords |
boolean |
removeStopWordsFromSelectedPhrases |
boolean |
restrictToMatched
Currently, does not work correctly.
|
boolean |
saveInvertedIndex
You can save the inverted index.
|
boolean |
sqrtPatScore
If score for a pattern is square rooted
|
String |
stopWordsPatternFiles
Words that are not learned.
|
ConstantsAndVariables.PatternForEachTokenWay |
storePatsForEachToken |
String |
targetAllowedNERs
Allowed NERs for labels.
|
String |
targetAllowedTagsInitialsStr
Initials of all POS tags to use if
usePOS4Pattern is true, separated by comma. |
double |
thresholdNumPatternsApplied |
double |
thresholdSelectPattern
Threshold for learning a pattern
|
double |
thresholdWordExtract |
boolean |
tuneThresholdKeepRunning
Reduce pattern threshold (=0.8*current_value) to extract as many patterns
as possible (still restricted by
numPatterns ) |
boolean |
useContextNERRestriction
If the NER tag of the context tokens is not the background symbol,
generalize the token with the NER tag
|
boolean |
useLemmaContextTokens
Use lemma instead of words for the context tokens
|
boolean |
useMatchingPhrase
Use the actual dictionary matching phrase(s) instead of the token word or
lemma in calculating the stats
|
boolean |
useOtherLabelsWordsasNegative
use the seed dictionaries and the new words learned for the other labels in
the previous iterations as negative
|
boolean |
usePatternEvalDomainNgram
|
boolean |
usePatternEvalEditDistOther
|
boolean |
usePatternEvalEditDistSame
|
boolean |
usePatternEvalGoogleNgram
|
boolean |
usePatternEvalSemanticOdds
|
boolean |
usePatternEvalWordClass
|
boolean |
usePatternEvalWordShape
|
boolean |
usePatternResultAsLabel
Label words that are learned so that in further iterations we have more
information
|
boolean |
usePhraseEvalDomainNgram
use domain tf-idf for learning phrases
|
boolean |
usePhraseEvalEditDistOther
Edit distance between this phrase and other phrases in other dictionaries
|
boolean |
usePhraseEvalEditDistSame
Edit distance between this phrase and the other phrases in the label
dictionary
|
boolean |
usePhraseEvalGoogleNgram
use google tf-idf for learning phrases
|
boolean |
usePhraseEvalPatWtByFreq
use \sum_allpat pattern_wt_that_extracted_phrase/phrase_freq for learning
phrases
|
boolean |
usePhraseEvalSemanticOdds
odds of the phrase freq in the label dictionary vs other dictionaries
|
boolean |
usePhraseEvalWordClass
Only works if you have single label.
|
boolean |
usePhraseEvalWordShape |
boolean |
useTargetNERRestriction
Add NER restriction to the target phrase in the patterns
|
boolean |
useTargetParserParentRestriction
Adds the parent's tag from the parse tree to the target phrase in the patterns
|
String |
wordIgnoreRegex
Do not learn phrases that match this regex.
|
edu.stanford.nlp.patterns.surface.GetPatternsFromDataMultiClass.WordScoring |
wordScoring |
boolean |
writeMatchedTokensFiles |
Constructor and Description |
---|
ConstantsAndVariables(Properties props,
Map<String,Set<String>> labelDictionary,
Map<String,Class<? extends TypesafeMap.Key<String>>> answerClass,
Map<String,Class> generalizeClasses,
Map<String,Map<Class,Object>> ignoreClasses) |
ConstantsAndVariables(Properties props,
Set<String> labels,
Map<String,Class<? extends TypesafeMap.Key<String>>> answerClass) |
ConstantsAndVariables(Properties props,
Set<String> labels,
Map<String,Class<? extends TypesafeMap.Key<String>>> answerClass,
Map<String,Class> generalizeClasses) |
ConstantsAndVariables(Properties props,
Set<String> labels,
Map<String,Class<? extends TypesafeMap.Key<String>>> answerClass,
Map<String,Class> generalizeClasses,
Map<String,Map<Class,Object>> ignoreClasses) |
@Execution.Option(name="numIterationsForPatterns") public Integer numIterationsForPatterns
@Execution.Option(name="numPatterns") public int numPatterns
@Execution.Option(name="outDir") public String outDir
@Execution.Option(name="allPatternsDir") public String allPatternsDir
@Execution.Option(name="computeAllPatterns") public boolean computeAllPatterns
@Execution.Option(name="patternScoring") public GetPatternsFromDataMultiClass.PatternScoring patternScoring
GetPatternsFromDataMultiClass.PatternScoring
for options.
@Execution.Option(name="thresholdSelectPattern") public double thresholdSelectPattern
@Execution.Option(name="restrictToMatched") public boolean restrictToMatched
@Execution.Option(name="usePatternResultAsLabel") public boolean usePatternResultAsLabel
@Execution.Option(name="debug") public int debug
@Execution.Option(name="identifier") public String identifier
@Execution.Option(name="useMatchingPhrase") public boolean useMatchingPhrase
@Execution.Option(name="tuneThresholdKeepRunning") public boolean tuneThresholdKeepRunning
numPatterns
)
@Execution.Option(name="maxExtractNumWords") public int maxExtractNumWords
@Execution.Option(name="useOtherLabelsWordsasNegative") public boolean useOtherLabelsWordsasNegative
@Execution.Option(name="useLemmaContextTokens") public boolean useLemmaContextTokens
@Execution.Option(name="matchLowerCaseContext") public boolean matchLowerCaseContext
@Execution.Option(name="useTargetNERRestriction") public boolean useTargetNERRestriction
@Execution.Option(name="targetAllowedTagsInitialsStr") public String targetAllowedTagsInitialsStr
usePOS4Pattern
is true, separated by comma.
@Execution.Option(name="targetAllowedNERs") public String targetAllowedNERs
useTargetNERRestriction
flag should be true
@Execution.Option(name="useTargetParserParentRestriction") public boolean useTargetParserParentRestriction
@Execution.Option(name="useContextNERRestriction") public boolean useContextNERRestriction
@Execution.Option(name="numWordsToAdd") public int numWordsToAdd
@Execution.Option(name="thresholdNumPatternsApplied") public double thresholdNumPatternsApplied
@Execution.Option(name="wordScoring") public edu.stanford.nlp.patterns.surface.GetPatternsFromDataMultiClass.WordScoring wordScoring
@Execution.Option(name="thresholdWordExtract") public double thresholdWordExtract
public boolean justify
@Execution.Option(name="LRSigma") public double LRSigma
@Execution.Option(name="englishWordsFiles") public String englishWordsFiles
@Execution.Option(name="commonWordsPatternFiles") public String commonWordsPatternFiles
removePhrasesWithStopWords
or
removeStopWordsFromSelectedPhrases
is true. Also, these words
are considered negative when scoring a pattern (similar to
othersemanticclasses).
@Execution.Option(name="otherSemanticClassesFiles") public String otherSemanticClassesFiles
@Execution.Option(name="minLen4FuzzyForPattern") public int minLen4FuzzyForPattern
@Execution.Option(name="wordIgnoreRegex") public String wordIgnoreRegex
@Execution.Option(name="numThreads") public int numThreads
@Execution.Option(name="stopWordsPatternFiles", gloss="stop words") public String stopWordsPatternFiles
CreatePatterns
is true.
public Map<String,Env> env
TokenSequencePattern
public Pattern ignoreWordRegex
@Execution.Option(name="removeStopWordsFromSelectedPhrases") public boolean removeStopWordsFromSelectedPhrases
@Execution.Option(name="removePhrasesWithStopWords") public boolean removePhrasesWithStopWords
@Execution.Option(name="includeExternalFeatures") public boolean includeExternalFeatures
@Execution.Option(name="externalFeatureWeightsFile") public String externalFeatureWeightsFile
@Execution.Option(name="doNotApplyPatterns") public boolean doNotApplyPatterns
@Execution.Option(name="numWordsCompound") public int numWordsCompound
@Execution.Option(name="sqrtPatScore") public boolean sqrtPatScore
@Execution.Option(name="minUnlabPhraseSupportForPat") public int minUnlabPhraseSupportForPat
@Execution.Option(name="minPosPhraseSupportForPat") public int minPosPhraseSupportForPat
@Execution.Option(name="addIndvWordsFromPhrasesExceptLastAsNeg") public boolean addIndvWordsFromPhrasesExceptLastAsNeg
@Execution.Option(name="invertedIndexClass", gloss="another option is Lucene backed, which is not included in the CoreNLP release. Contact us to get a copy (distributed under Apache License).") public Class<? extends SentenceIndex> invertedIndexClass
@Execution.Option(name="invertedIndexDirectory") public String invertedIndexDirectory
@Execution.Option(name="clubNeighboringLabeledWords") public boolean clubNeighboringLabeledWords
@Execution.Option(name="removeOverLappingLabelsFromSeed") public boolean removeOverLappingLabelsFromSeed
@Execution.Option(name="usePhraseEvalWordClass") public boolean usePhraseEvalWordClass
@Execution.Option(name="usePhraseEvalGoogleNgram") public boolean usePhraseEvalGoogleNgram
@Execution.Option(name="usePhraseEvalDomainNgram") public boolean usePhraseEvalDomainNgram
@Execution.Option(name="usePhraseEvalPatWtByFreq") public boolean usePhraseEvalPatWtByFreq
@Execution.Option(name="usePhraseEvalSemanticOdds") public boolean usePhraseEvalSemanticOdds
@Execution.Option(name="usePhraseEvalEditDistSame") public boolean usePhraseEvalEditDistSame
@Execution.Option(name="usePhraseEvalEditDistOther") public boolean usePhraseEvalEditDistOther
@Execution.Option(name="usePhraseEvalWordShape") public boolean usePhraseEvalWordShape
@Execution.Option(name="usePatternEvalWordClass") public boolean usePatternEvalWordClass
@Execution.Option(name="usePatternEvalWordShape") public boolean usePatternEvalWordShape
@Execution.Option(name="usePatternEvalGoogleNgram") public boolean usePatternEvalGoogleNgram
@Execution.Option(name="usePatternEvalDomainNgram") public boolean usePatternEvalDomainNgram
@Execution.Option(name="usePatternEvalSemanticOdds") public boolean usePatternEvalSemanticOdds
@Execution.Option(name="usePatternEvalEditDistSame") public boolean usePatternEvalEditDistSame
@Execution.Option(name="usePatternEvalEditDistOther") public boolean usePatternEvalEditDistOther
@Execution.Option(name="perSelectRand") public double perSelectRand
@Execution.Option(name="perSelectNeg") public double perSelectNeg
@Execution.Option(name="doNotExtractPhraseAnyWordLabeledOtherClass") public boolean doNotExtractPhraseAnyWordLabeledOtherClass
@Execution.Option(name="saveInvertedIndex") public boolean saveInvertedIndex
invertedIndexDirectory
if given.
@Execution.Option(name="loadInvertedIndex") public boolean loadInvertedIndex
@Execution.Option(name="storePatsForEachToken", gloss="used for storing patterns in PSQL") public ConstantsAndVariables.PatternForEachTokenWay storePatsForEachToken
public String backgroundSymbol
public SentenceIndex invertedIndex
public static String extremedebug
public static String minimaldebug
public ConcurrentHashIndex<SurfacePattern> patternIndex
@Execution.Option(name="batchProcessSents") public boolean batchProcessSents
@Execution.Option(name="writeMatchedTokensFiles") public boolean writeMatchedTokensFiles
public ConstantsAndVariables(Properties props, Set<String> labels, Map<String,Class<? extends TypesafeMap.Key<String>>> answerClass, Map<String,Class> generalizeClasses, Map<String,Map<Class,Object>> ignoreClasses) throws IOException
IOException
public ConstantsAndVariables(Properties props, Map<String,Set<String>> labelDictionary, Map<String,Class<? extends TypesafeMap.Key<String>>> answerClass, Map<String,Class> generalizeClasses, Map<String,Map<Class,Object>> ignoreClasses) throws IOException
IOException
public ConstantsAndVariables(Properties props, Set<String> labels, Map<String,Class<? extends TypesafeMap.Key<String>>> answerClass) throws IOException
IOException
public ConstantsAndVariables(Properties props, Set<String> labels, Map<String,Class<? extends TypesafeMap.Key<String>>> answerClass, Map<String,Class> generalizeClasses) throws IOException
IOException
public ConcurrentHashIndex<SurfacePattern> getPatternIndex()
public void setPatternIndex(ConcurrentHashIndex<SurfacePattern> patternIndex)
public void setUp(Properties props) throws IOException
IOException
public void setWordShapesForLabels(Map<String,Counter<String>> wordShapesForLabels)
public Pair<String,Double> getEditDistanceFromThisClass(String label, String ph, int minLen)
public Pair<String,Double> getEditDistanceFromOtherSemanticClasses(String ph, int minLen)
public double getEditDistanceFromEng(String ph, int minLen)
public ConcurrentHashMap<String,Double> getEditDistanceFromEnglishWords()
public ConcurrentHashMap<String,String> getEditDistanceFromEnglishWordsMatches()
public double getEditDistanceScoresOtherClass(String g)
public double getEditDistanceScoresOtherClassThreshold(String g)
g
public double getEditDistanceScoresThisClassThreshold(String label, String g)
public void setGeneralWordClassClusters(Map<String,Integer> generalWordClassClusters)
public Map<String,Class<? extends TypesafeMap.Key<String>>> getAnswerClass()
public Map<String,Map<Class,Object>> getIgnoreWordswithClassesDuringSelection()
public Counter<SurfacePattern> transformPatternsToSurface(Counter<Integer> pats)
public Counter<Integer> transformPatternsToIndex(Counter<SurfacePattern> pats)
public Integer transformPatternToIndex(SurfacePattern pat)