|
|||||||||
PREV CLASS NEXT CLASS | FRAMES NO FRAMES | ||||||||
SUMMARY: NESTED | FIELD | CONSTR | METHOD | DETAIL: FIELD | CONSTR | METHOD |
java.lang.Object
edu.stanford.nlp.classify.GeneralDataset<L,F>
edu.stanford.nlp.classify.Dataset<L,F>
L
- Label type
F
- Feature type
public class Dataset<L,F>
An interfacing class for ClassifierFactory that incrementally builds a more memory-efficient representation of a List of Datum objects for the purposes of training a Classifier with a ClassifierFactory.
and #getL1NormalizedTFIDFDataset()
Field Summary |
---|
Fields inherited from class edu.stanford.nlp.classify.GeneralDataset |
---|
data, featureIndex, labelIndex, labels, size |
Constructor Summary | |
---|---|
Dataset()
|
|
Dataset(Index<F> featureIndex,
Index<L> labelIndex)
|
|
Dataset(Index<L> labelIndex,
int[] labels,
Index<F> featureIndex,
int[][] data)
Constructor that fully specifies a Dataset. |
|
Dataset(Index<L> labelIndex,
int[] labels,
Index<F> featureIndex,
int[][] data,
int size)
Constructor that fully specifies a Dataset. |
|
Dataset(int numDatums)
|
|
Dataset(int numDatums,
Index<F> featureIndex,
Index<L> labelIndex)
|
Method Summary | |
---|---|
void |
add(java.util.Collection<F> features,
L label)
|
void |
add(java.util.Collection<F> features,
L label,
boolean addNewFeatures)
|
void |
add(Datum<L,F> d)
|
void |
add(int[] features,
int label)
Adds a datum defined by feature indices and label index. Careful with this one! Make sure that all indices are valid! |
protected void |
addFeatureIndices(int[] features)
|
protected void |
addFeatures(java.util.Collection<F> features)
|
protected void |
addFeatures(java.util.Collection<F> features,
boolean addNewFeatures)
|
protected void |
addLabel(L label)
|
protected void |
addLabelIndex(int label)
|
void |
applyFeatureCountThreshold(java.util.List<Pair<java.util.regex.Pattern,java.lang.Integer>> thresholds)
Applies feature count thresholds to the Dataset. |
void |
changeFeatureIndex(Index<F> newFeatureIndex)
|
void |
changeLabelIndex(Index<L> newLabelIndex)
|
protected void |
ensureSize()
|
Datum<L,F> |
getDatum(int index)
|
Counter<F> |
getFeatureCounter()
Gets the number of datums a given feature appears in. |
double[] |
getInformationGains()
|
RVFDataset<L,F> |
getL1NormalizedTFIDFDataset()
Method to convert this dataset to RVFDataset using L1-normalized TF-IDF features |
RVFDatum<L,F> |
getL1NormalizedTFIDFDatum(Datum<L,F> datum,
Counter<F> featureDocCounts)
Method to convert features from counts to L1-normalized TFIDF based features |
Dataset<L,F> |
getRandomSubDataset(double p,
int seed)
|
RVFDatum<L,F> |
getRVFDatum(int index)
|
double[][] |
getValuesArray()
|
protected void |
initialize(int numDatums)
This method takes care of resetting values of the dataset such that it is empty with an initial capacity of numDatums. |
void |
printFullFeatureMatrix(java.io.PrintWriter pw)
prints the full feature matrix in tab-delimited form. |
void |
printSparseFeatureMatrix()
prints the sparse feature matrix using printSparseFeatureMatrix() to System.out. |
void |
printSparseFeatureMatrix(java.io.PrintWriter pw)
prints a sparse feature matrix representation of the Dataset. |
static void |
printSVMLightFormat(java.io.PrintWriter pw,
ClassicCounter<java.lang.Integer> c,
int classNo)
Need to sort the counter by feature keys and dump it |
static Dataset<java.lang.String,java.lang.String> |
readSVMLightFormat(java.lang.String filename)
Constructs a Dataset by reading in a file in SVM light format. |
static Dataset<java.lang.String,java.lang.String> |
readSVMLightFormat(java.lang.String filename,
Index<java.lang.String> featureIndex,
Index<java.lang.String> labelIndex)
Constructs a Dataset by reading in a file in SVM light format. |
static Dataset<java.lang.String,java.lang.String> |
readSVMLightFormat(java.lang.String filename,
Index<java.lang.String> featureIndex,
Index<java.lang.String> labelIndex,
java.util.List<java.lang.String> lines)
Constructs a Dataset by reading in a file in SVM light format. |
static Dataset<java.lang.String,java.lang.String> |
readSVMLightFormat(java.lang.String filename,
java.util.List<java.lang.String> lines)
Constructs a Dataset by reading in a file in SVM light format. |
void |
selectFeatures(int numFeatures,
double[] scores)
Generic method to select features based on the feature scores vector provided as an argument. |
void |
selectFeaturesBinaryInformationGain(int numFeatures)
|
Pair<GeneralDataset<L,F>,GeneralDataset<L,F>> |
split(double percentDev)
|
Pair<GeneralDataset<L,F>,GeneralDataset<L,F>> |
split(int start,
int end)
|
void |
summaryStatistics()
Prints some summary statistics to stderr for the Dataset. |
static Datum<java.lang.String,java.lang.String> |
svmLightLineToDatum(java.lang.String l)
|
java.lang.String |
toString()
|
java.lang.String |
toSummaryStatistics()
|
java.lang.String |
toSummaryString()
|
void |
updateLabels(int[] labels)
|
Methods inherited from class edu.stanford.nlp.classify.GeneralDataset |
---|
addAll, applyFeatureCountThreshold, applyFeatureMaxCountThreshold, clear, clear, featureIndex, getDataArray, getFeatureCounts, getLabelsArray, iterator, labelIndex, labelIterator, makeSvmLabelMap, mapDataset, mapDataset, mapDatum, numClasses, numFeatures, numFeatureTokens, numFeatureTypes, printSVMLightFormat, printSVMLightFormat, randomize, sampleDataset, size, trimData, trimLabels, trimToSize, trimToSize, trimToSize |
Methods inherited from class java.lang.Object |
---|
clone, equals, finalize, getClass, hashCode, notify, notifyAll, wait, wait, wait |
Constructor Detail |
---|
public Dataset()
public Dataset(int numDatums)
public Dataset(int numDatums, Index<F> featureIndex, Index<L> labelIndex)
public Dataset(Index<F> featureIndex, Index<L> labelIndex)
public Dataset(Index<L> labelIndex, int[] labels, Index<F> featureIndex, int[][] data)
public Dataset(Index<L> labelIndex, int[] labels, Index<F> featureIndex, int[][] data, int size)
Method Detail |
---|
public Pair<GeneralDataset<L,F>,GeneralDataset<L,F>> split(double percentDev)
split
in class GeneralDataset<L,F>
public Pair<GeneralDataset<L,F>,GeneralDataset<L,F>> split(int start, int end)
split
in class GeneralDataset<L,F>
public Dataset<L,F> getRandomSubDataset(double p, int seed)
public double[][] getValuesArray()
getValuesArray
in class GeneralDataset<L,F>
public static Dataset<java.lang.String,java.lang.String> readSVMLightFormat(java.lang.String filename)
public static Dataset<java.lang.String,java.lang.String> readSVMLightFormat(java.lang.String filename, java.util.List<java.lang.String> lines)
public static Dataset<java.lang.String,java.lang.String> readSVMLightFormat(java.lang.String filename, Index<java.lang.String> featureIndex, Index<java.lang.String> labelIndex)
public static Dataset<java.lang.String,java.lang.String> readSVMLightFormat(java.lang.String filename, Index<java.lang.String> featureIndex, Index<java.lang.String> labelIndex, java.util.List<java.lang.String> lines)
public static Datum<java.lang.String,java.lang.String> svmLightLineToDatum(java.lang.String l)
public Counter<F> getFeatureCounter()
public RVFDatum<L,F> getL1NormalizedTFIDFDatum(Datum<L,F> datum, Counter<F> featureDocCounts)
datum
- with a collection of features.
featureDocCounts
- a counter of doc-count for each feature.
public RVFDataset<L,F> getL1NormalizedTFIDFDataset()
public void add(Datum<L,F> d)
add
in class GeneralDataset<L,F>
public void add(java.util.Collection<F> features, L label)
public void add(java.util.Collection<F> features, L label, boolean addNewFeatures)
public void add(int[] features, int label)
features
-
label
-
protected void ensureSize()
protected void addLabel(L label)
protected void addLabelIndex(int label)
protected void addFeatures(java.util.Collection<F> features)
protected void addFeatures(java.util.Collection<F> features, boolean addNewFeatures)
protected void addFeatureIndices(int[] features)
protected final void initialize(int numDatums)
GeneralDataset
initialize
in class GeneralDataset<L,F>
numDatums
- initial capacity of dataset
public Datum<L,F> getDatum(int index)
getDatum
in class GeneralDataset<L,F>
public RVFDatum<L,F> getRVFDatum(int index)
getRVFDatum
in class GeneralDataset<L,F>
public void summaryStatistics()
summaryStatistics
in class GeneralDataset<L,F>
public java.lang.String toSummaryStatistics()
public void applyFeatureCountThreshold(java.util.List<Pair<java.util.regex.Pattern,java.lang.Integer>> thresholds)
thresholds
- a list of pattern, threshold pairs
public void printFullFeatureMatrix(java.io.PrintWriter pw)
public void printSparseFeatureMatrix()
printSparseFeatureMatrix() to System.out.
public void printSparseFeatureMatrix(java.io.PrintWriter pw)
Object.toString()
representations of features.
public void changeLabelIndex(Index<L> newLabelIndex)
public void changeFeatureIndex(Index<F> newFeatureIndex)
public void selectFeaturesBinaryInformationGain(int numFeatures)
public void selectFeatures(int numFeatures, double[] scores)
numFeatures
- number of features to be selected.
scores
- a vector of size total number of features in the data.
public double[] getInformationGains()
public void updateLabels(int[] labels)
public java.lang.String toString()
toString
in class java.lang.Object
public java.lang.String toSummaryString()
public static void printSVMLightFormat(java.io.PrintWriter pw, ClassicCounter<java.lang.Integer> c, int classNo)
|
|||||||||
PREV CLASS NEXT CLASS | FRAMES NO FRAMES | ||||||||
SUMMARY: NESTED | FIELD | CONSTR | METHOD | DETAIL: FIELD | CONSTR | METHOD |