L - The type of the labels in the Dataset
F - The type of the features in the Dataset

public abstract class GeneralDataset<L,F> extends java.lang.Object implements java.io.Serializable, java.lang.Iterable<RVFDatum<L,F>>
Dataset and RVFDataset.

Modifier and Type | Field and Description |
---|---|
protected int[][] |
data |
Index<F> |
featureIndex |
Index<L> |
labelIndex |
protected int[] |
labels |
protected int |
size |
Constructor and Description |
---|
GeneralDataset() |
Modifier and Type | Method and Description |
---|---|
abstract void |
add(Datum<L,F> d) |
void |
addAll(java.lang.Iterable<? extends Datum<L,F>> data)
Adds all Datums in the given collection of data to this dataset
|
void |
applyFeatureCountThreshold(int k)
Applies a feature count threshold to the Dataset.
|
void |
applyFeatureMaxCountThreshold(int k)
Applies a max feature count threshold to the Dataset.
|
void |
clear()
Resets the Dataset so that it is empty and ready to collect data.
|
void |
clear(int numDatums)
Resets the Dataset so that it is empty and ready to collect data.
|
Index<F> |
featureIndex() |
int[][] |
getDataArray() |
abstract Datum<L,F> |
getDatum(int index) |
float[] |
getFeatureCounts()
Get the total count (over all data instances) of each feature
|
int[] |
getLabelsArray() |
abstract RVFDatum<L,F> |
getRVFDatum(int index) |
abstract double[][] |
getValuesArray() |
protected abstract void |
initialize(int numDatums)
This method takes care of resetting values of the dataset
such that it is empty with an initial capacity of numDatums.
|
java.util.Iterator<RVFDatum<L,F>> |
iterator() |
Index<L> |
labelIndex() |
java.util.Iterator<L> |
labelIterator()
Returns an iterator over the class labels of the Dataset
|
java.lang.String[] |
makeSvmLabelMap()
Maps our labels to labels that are compatible with svm_light
|
GeneralDataset<L,F> |
mapDataset(GeneralDataset<L,F> dataset) |
<L2> GeneralDataset<L2,F> |
mapDataset(GeneralDataset<L,F> dataset,
Index<L2> newLabelIndex,
java.util.Map<L,L2> labelMapping,
L2 defaultLabel) |
static <L,L2,F> Datum<L2,F> |
mapDatum(Datum<L,F> d,
java.util.Map<L,L2> labelMapping,
L2 defaultLabel) |
int |
numClasses() |
ClassicCounter<L> |
numDatumsPerLabel() |
int |
numFeatures() |
int |
numFeatureTokens()
Returns the number of feature tokens in the Dataset.
|
int |
numFeatureTypes()
Returns the number of distinct feature types in the Dataset.
|
void |
printSVMLightFormat()
Dumps the Dataset as a training/test file for SVMLight.
|
void |
printSVMLightFormat(java.io.PrintWriter pw)
Print SVM Light Format file.
|
void |
randomize(int randomSeed)
Randomizes the data array in place.
|
GeneralDataset<L,F> |
sampleDataset(int randomSeed,
double sampleFrac,
boolean sampleWithReplacement) |
int |
size()
Returns the number of examples (Datums) in the Dataset. |
abstract Pair<GeneralDataset<L,F>,GeneralDataset<L,F>> |
split(double p) |
abstract Pair<GeneralDataset<L,F>,GeneralDataset<L,F>> |
split(int start,
int end) |
abstract void |
summaryStatistics()
Print some statistics summarizing the dataset
|
protected void |
trimData() |
protected void |
trimLabels() |
protected double[][] |
trimToSize(double[][] i) |
protected int[] |
trimToSize(int[] i) |
protected int[][] |
trimToSize(int[][] i) |
public int numFeatures()
public int numClasses()
public int[] getLabelsArray()
public int[][] getDataArray()
public abstract double[][] getValuesArray()
public void clear()
public void clear(int numDatums)
numDatums - initial capacity of dataset
protected abstract void initialize(int numDatums)
numDatums - initial capacity of dataset
public float[] getFeatureCounts()
public void applyFeatureCountThreshold(int k)
public void applyFeatureMaxCountThreshold(int k)
public int numFeatureTokens()
public int numFeatureTypes()
public void addAll(java.lang.Iterable<? extends Datum<L,F>> data)
data - collection of datums you would like to add to the dataset
public abstract Pair<GeneralDataset<L,F>,GeneralDataset<L,F>> split(int start, int end)
public abstract Pair<GeneralDataset<L,F>,GeneralDataset<L,F>> split(double p)
public int size()
Returns the number of examples (Datums) in the Dataset.
protected void trimData()
protected void trimLabels()
protected int[] trimToSize(int[] i)
protected int[][] trimToSize(int[][] i)
protected double[][] trimToSize(double[][] i)
public void randomize(int randomSeed)
randomSeed -
public GeneralDataset<L,F> sampleDataset(int randomSeed, double sampleFrac, boolean sampleWithReplacement)
public abstract void summaryStatistics()
public java.util.Iterator<L> labelIterator()
public GeneralDataset<L,F> mapDataset(GeneralDataset<L,F> dataset)
dataset -
public static <L,L2,F> Datum<L2,F> mapDatum(Datum<L,F> d, java.util.Map<L,L2> labelMapping, L2 defaultLabel)
public <L2> GeneralDataset<L2,F> mapDataset(GeneralDataset<L,F> dataset, Index<L2> newLabelIndex, java.util.Map<L,L2> labelMapping, L2 defaultLabel)
dataset -
public void printSVMLightFormat()
public java.lang.String[] makeSvmLabelMap()
public void printSVMLightFormat(java.io.PrintWriter pw)
public ClassicCounter<L> numDatumsPerLabel()