public class ChunkAnnotationUtils
extends java.lang.Object
Modifier and Type | Method and Description |
---|---|
static void |
annotateChunk(CoreMap annotation,
java.lang.Class newAnnotationKey,
java.lang.Class aggrKey,
CoreMapAttributeAggregator aggregator) |
static void |
annotateChunk(CoreMap chunk,
java.util.List<CoreLabel> tokens,
int tokenStartIndex,
int tokenEndIndex,
int totalTokenOffset)
Annotates a CoreMap representing a chunk with basic chunk information.
|
static void |
annotateChunk(CoreMap chunk,
java.util.Map<java.lang.String,java.lang.String> attributes) |
static void |
annotateChunks(java.util.List<? extends CoreMap> chunks,
int start,
int end,
java.util.Map<java.lang.String,java.lang.String> attributes) |
static void |
annotateChunks(java.util.List<? extends CoreMap> chunks,
java.util.Map<java.lang.String,java.lang.String> attributes) |
static void |
annotateChunkText(CoreMap chunk,
java.lang.Class tokenTextKey)
Annotates a CoreMap representing a chunk with text information
TextAnnotation - String representing tokens in this chunk (token text separated by space)
|
static boolean |
annotateChunkText(CoreMap chunk,
CoreMap origAnnotation)
Annotates a CoreMap representing a chunk with text information
TextAnnotation - String extracted from the origAnnotation using character offset information for this chunk
|
static void |
annotateChunkTokens(CoreMap chunk,
java.lang.Class tokenChunkKey,
java.lang.Class tokenLabelKey)
Annotates tokens in chunk.
|
static <T extends CoreMap> void |
appendCoreMap(java.util.List<T> res,
CoreMap cm,
java.lang.String text,
int start,
int end,
CoreTokenFactory<T> factory) |
static boolean |
checkOffsets(CoreMap docAnnotation)
Checks if offsets of doc and sentence matches.
|
static void |
copyUnsetAnnotations(CoreMap src,
CoreMap dest)
Copies annotation over to this CoreMap if not already set.
|
static <T extends CoreMap> T |
createCoreMap(CoreMap cm,
java.lang.String text,
int start,
int end,
CoreTokenFactory<T> factory) |
static boolean |
fixChunkSentenceBoundaries(CoreMap docAnnotation,
java.util.List<IntPair> chunkCharOffsets)
Given a list of character offsets for chunks, fix sentence splitting
so that sentences don't break the chunks.
|
static boolean |
fixChunkSentenceBoundaries(CoreMap docAnnotation,
java.util.List<IntPair> chunkCharOffsets,
boolean offsetsAreNotSorted,
boolean extendedFixSentence,
boolean moreExtendedFixSentence)
Given a list of character offsets for chunks, fix sentence splitting
so that sentences don't break the chunks.
|
static boolean |
fixChunkTokenBoundaries(CoreMap docAnnotation,
java.util.List<IntPair> chunkCharOffsets)
Given a list of character offsets for chunks, fix tokenization so that token boundaries occur at
the boundaries of chunks.
|
static boolean |
fixTokenOffsets(CoreMap docAnnotation)
Fix token offsets of sentences to match those in the document (assumes tokens are shared).
Sentence token indices may not match the document token list if certain HTML elements are ignored.
|
static Annotation |
getAnnotatedChunk(CoreMap annotation,
int tokenStartIndex,
int tokenEndIndex)
Create a new chunk Annotation with basic chunk information
CharacterOffsetBeginAnnotation - set to CharacterOffsetBeginAnnotation of first token in chunk
CharacterOffsetEndAnnotation - set to CharacterOffsetEndAnnotation of last token in chunk
TokensAnnotation - List of tokens in this chunk
TokenBeginAnnotation - Index of first token in chunk (index in original list of tokens)
tokenStartIndex + annotation's TokenBeginAnnotation
TokenEndAnnotation - Index of last token in chunk (index in original list of tokens)
tokenEndIndex + annotation's TokenBeginAnnotation
TextAnnotation - String extracted from the origAnnotation using character offset information for this chunk
|
static Annotation |
getAnnotatedChunk(CoreMap annotation,
int tokenStartIndex,
int tokenEndIndex,
java.lang.Class tokenChunkKey,
java.lang.Class tokenLabelKey)
Create a new chunk Annotation with basic chunk information
CharacterOffsetBeginAnnotation - set to CharacterOffsetBeginAnnotation of first token in chunk
CharacterOffsetEndAnnotation - set to CharacterOffsetEndAnnotation of last token in chunk
TokensAnnotation - List of tokens in this chunk
TokenBeginAnnotation - Index of first token in chunk (index in original list of tokens)
tokenStartIndex + annotation's TokenBeginAnnotation
TokenEndAnnotation - Index of last token in chunk (index in original list of tokens)
tokenEndIndex + annotation's TokenBeginAnnotation
TextAnnotation - String extracted from the origAnnotation using character offset information for this chunk
|
static Annotation |
getAnnotatedChunk(java.util.List<CoreLabel> tokens,
int tokenStartIndex,
int tokenEndIndex,
int totalTokenOffset)
Create a new chunk Annotation with basic chunk information.
|
static Annotation |
getAnnotatedChunk(java.util.List<CoreLabel> tokens,
int tokenStartIndex,
int tokenEndIndex,
int totalTokenOffset,
java.lang.Class tokenChunkKey,
java.lang.Class tokenTextKey,
java.lang.Class tokenLabelKey)
Create a new chunk Annotation with basic chunk information.
|
static java.util.List<CoreMap> |
getAnnotatedChunksUsingSortedCharOffsets(CoreMap annotation,
java.util.List<IntPair> charOffsets) |
static java.util.List<CoreMap> |
getAnnotatedChunksUsingSortedCharOffsets(CoreMap annotation,
java.util.List<IntPair> charOffsets,
boolean charOffsetIsRelative,
java.lang.Class tokenChunkKey,
java.lang.Class tokenLabelKey,
boolean allowPartialTokens)
Create a list of new chunk Annotation with basic chunk information.
|
static CoreMap |
getAnnotatedChunkUsingCharOffsets(CoreMap annotation,
int charOffsetStart,
int charOffsetEnd)
Returns a chunk annotation based on char offsets.
|
static Interval<java.lang.Integer> |
getChunkOffsetsUsingCharOffsets(java.util.List<? extends CoreMap> chunkList,
int charStart,
int charEnd)
Return chunk offsets
|
static CoreMap |
getMergedChunk(java.util.List<? extends CoreMap> chunkList,
int chunkIndexStart,
int chunkIndexEnd,
java.util.Map<java.lang.Class,CoreMapAttributeAggregator> aggregators,
CoreLabelTokenFactory tokenFactory)
Create chunk that is merged from chunkIndexStart to chunkIndexEnd (exclusive)
|
static CoreMap |
getMergedChunk(java.util.List<? extends CoreMap> chunkList,
java.lang.String origText,
int chunkIndexStart,
int chunkIndexEnd,
CoreLabelTokenFactory tokenFactory)
Create chunk that is merged from chunkIndexStart to chunkIndexEnd (exclusive).
|
static java.lang.String |
getTokenText(java.util.List<? extends CoreMap> tokens,
java.lang.Class tokenTextKey) |
static java.lang.String |
getTokenText(java.util.List<? extends CoreMap> tokens,
java.lang.Class tokenTextKey,
java.lang.String delimiter) |
static boolean |
hasCharacterOffsets(CoreMap chunk) |
static void |
mergeChunks(java.util.List<CoreMap> chunkList,
java.lang.String origText,
int chunkIndexStart,
int chunkIndexEnd)
Merge chunks from chunkIndexStart to chunkIndexEnd (exclusive) and replace them in the list.
|
static <T extends CoreMap> java.util.List<T> |
splitCoreMap(java.util.regex.Pattern p,
boolean includeMatched,
CoreMap cm,
CoreTokenFactory<T> factory) |
public static boolean checkOffsets(CoreMap docAnnotation)
docAnnotation - The document Annotation to analyze
public static boolean fixTokenOffsets(CoreMap docAnnotation)
docAnnotation - The document Annotation to analyze
public static void copyUnsetAnnotations(CoreMap src, CoreMap dest)
public static boolean fixChunkTokenBoundaries(CoreMap docAnnotation, java.util.List<IntPair> chunkCharOffsets)
docAnnotation -
chunkCharOffsets -
public static CoreMap getMergedChunk(java.util.List<? extends CoreMap> chunkList, java.lang.String origText, int chunkIndexStart, int chunkIndexEnd, CoreLabelTokenFactory tokenFactory)
chunkList - List of chunks
origText - Text from which to extract chunk text
chunkIndexStart - Index of first chunk to merge
chunkIndexEnd - Index of last chunk to merge (exclusive)
tokenFactory - factory for creating tokens (if we want to get a merged corelabel instead of something random)
public static CoreMap getMergedChunk(java.util.List<? extends CoreMap> chunkList, int chunkIndexStart, int chunkIndexEnd, java.util.Map<java.lang.Class,CoreMapAttributeAggregator> aggregators, CoreLabelTokenFactory tokenFactory)
chunkList - List of chunks
chunkIndexStart - Index of first chunk to merge
chunkIndexEnd - Index of last chunk to merge (exclusive)
aggregators - Aggregators
tokenFactory - factory for creating tokens (if we want to get a merged corelabel instead of something random)
public static Interval<java.lang.Integer> getChunkOffsetsUsingCharOffsets(java.util.List<? extends CoreMap> chunkList, int charStart, int charEnd)
chunkList - List of chunks
charStart - character begin offset
charEnd - character end offset
public static void mergeChunks(java.util.List<CoreMap> chunkList, java.lang.String origText, int chunkIndexStart, int chunkIndexEnd)
chunkList - List of chunks
origText - Text from which to extract chunk text
chunkIndexStart - Index of first chunk to merge
chunkIndexEnd - Index of last chunk to merge (exclusive)
public static boolean fixChunkSentenceBoundaries(CoreMap docAnnotation, java.util.List<IntPair> chunkCharOffsets)
docAnnotation - Document with sentences
chunkCharOffsets - ordered pairs of different chunks that should appear in sentences
public static boolean fixChunkSentenceBoundaries(CoreMap docAnnotation, java.util.List<IntPair> chunkCharOffsets, boolean offsetsAreNotSorted, boolean extendedFixSentence, boolean moreExtendedFixSentence)
docAnnotation - Document with sentences
chunkCharOffsets - ordered pairs of different chunks that should appear in sentences
offsetsAreNotSorted - Treat each pair of offsets as independent (look through all sentences again)
extendedFixSentence - Do extended sentence fixing based on some heuristics
moreExtendedFixSentence - Do even more extended sentence fixing based on some heuristics
public static void annotateChunk(CoreMap chunk, java.util.List<CoreLabel> tokens, int tokenStartIndex, int tokenEndIndex, int totalTokenOffset)
chunk - CoreMap to be annotated
tokens - List of tokens to look for chunks
tokenStartIndex - Index (relative to current list of tokens) at which this chunk starts
tokenEndIndex - Index (relative to current list of tokens) at which this chunk ends (not inclusive)
totalTokenOffset - Index of tokens to offset by
public static java.lang.String getTokenText(java.util.List<? extends CoreMap> tokens, java.lang.Class tokenTextKey)
public static java.lang.String getTokenText(java.util.List<? extends CoreMap> tokens, java.lang.Class tokenTextKey, java.lang.String delimiter)
public static void annotateChunkText(CoreMap chunk, java.lang.Class tokenTextKey)
chunk - CoreMap to be annotated
tokenTextKey - Key to use to find the token text
public static boolean hasCharacterOffsets(CoreMap chunk)
public static boolean annotateChunkText(CoreMap chunk, CoreMap origAnnotation)
chunk - CoreMap to be annotated
origAnnotation - Annotation from which to extract the text for this chunk
public static void annotateChunkTokens(CoreMap chunk, java.lang.Class tokenChunkKey, java.lang.Class tokenLabelKey)
chunk - CoreMap representing chunk (should have TextAnnotation and TokensAnnotation)
tokenChunkKey - If not null, each token is annotated with the chunk using this key
tokenLabelKey - If not null, each token is annotated with the text associated with the chunk using this key
public static Annotation getAnnotatedChunk(java.util.List<CoreLabel> tokens, int tokenStartIndex, int tokenEndIndex, int totalTokenOffset)
tokens - List of tokens to look for chunks
tokenStartIndex - Index (relative to current list of tokens) at which this chunk starts
tokenEndIndex - Index (relative to current list of tokens) at which this chunk ends (not inclusive)
totalTokenOffset - Index of tokens to offset by
public static Annotation getAnnotatedChunk(java.util.List<CoreLabel> tokens, int tokenStartIndex, int tokenEndIndex, int totalTokenOffset, java.lang.Class tokenChunkKey, java.lang.Class tokenTextKey, java.lang.Class tokenLabelKey)
tokens - List of tokens to look for chunks
tokenStartIndex - Index (relative to current list of tokens) at which this chunk starts
tokenEndIndex - Index (relative to current list of tokens) at which this chunk ends (not inclusive)
totalTokenOffset - Index of tokens to offset by
tokenChunkKey - If not null, each token is annotated with the chunk using this key
tokenTextKey - Key to use to find the token text
tokenLabelKey - If not null, each token is annotated with the text associated with the chunk using this key
public static Annotation getAnnotatedChunk(CoreMap annotation, int tokenStartIndex, int tokenEndIndex)
annotation - Annotation from which to extract the text for this chunk
tokenStartIndex - Index (relative to current list of tokens) at which this chunk starts
tokenEndIndex - Index (relative to current list of tokens) at which this chunk ends (not inclusive)
public static Annotation getAnnotatedChunk(CoreMap annotation, int tokenStartIndex, int tokenEndIndex, java.lang.Class tokenChunkKey, java.lang.Class tokenLabelKey)
annotation - Annotation from which to extract the text for this chunk
tokenStartIndex - Index (relative to current list of tokens) at which this chunk starts
tokenEndIndex - Index (relative to current list of tokens) at which this chunk ends (not inclusive)
tokenChunkKey - If not null, each token is annotated with the chunk using this key
tokenLabelKey - If not null, each token is annotated with the text associated with the chunk using this key
public static CoreMap getAnnotatedChunkUsingCharOffsets(CoreMap annotation, int charOffsetStart, int charOffsetEnd)
annotation - Annotation from which to extract the text for this chunk
charOffsetStart - Start character offset
charOffsetEnd - End (not inclusive) character offset
Returns: null if no chunk matches offsets.
public static java.util.List<CoreMap> getAnnotatedChunksUsingSortedCharOffsets(CoreMap annotation, java.util.List<IntPair> charOffsets)
public static java.util.List<CoreMap> getAnnotatedChunksUsingSortedCharOffsets(CoreMap annotation, java.util.List<IntPair> charOffsets, boolean charOffsetIsRelative, java.lang.Class tokenChunkKey, java.lang.Class tokenLabelKey, boolean allowPartialTokens)
annotation - Annotation from which to extract the text for this chunk
charOffsets - List of start and end (not inclusive) character offsets.
Note: assume char offsets are sorted and non-overlapping!!!
charOffsetIsRelative - Whether the character offsets are relative to the current annotation or absolute offsets
tokenChunkKey - If not null, each token is annotated with the chunk using this key
tokenLabelKey - If not null, each token is annotated with the text associated with the chunk using this key
allowPartialTokens - Whether to allow partial tokens or not
public static void annotateChunk(CoreMap annotation, java.lang.Class newAnnotationKey, java.lang.Class aggrKey, CoreMapAttributeAggregator aggregator)
public static void annotateChunk(CoreMap chunk, java.util.Map<java.lang.String,java.lang.String> attributes)
public static void annotateChunks(java.util.List<? extends CoreMap> chunks, int start, int end, java.util.Map<java.lang.String,java.lang.String> attributes)
public static void annotateChunks(java.util.List<? extends CoreMap> chunks, java.util.Map<java.lang.String,java.lang.String> attributes)
public static <T extends CoreMap> T createCoreMap(CoreMap cm, java.lang.String text, int start, int end, CoreTokenFactory<T> factory)
public static <T extends CoreMap> void appendCoreMap(java.util.List<T> res, CoreMap cm, java.lang.String text, int start, int end, CoreTokenFactory<T> factory)
public static <T extends CoreMap> java.util.List<T> splitCoreMap(java.util.regex.Pattern p, boolean includeMatched, CoreMap cm, CoreTokenFactory<T> factory)