edu.stanford.nlp.pipeline
Class ChunkAnnotationUtils

java.lang.Object
  extended by edu.stanford.nlp.pipeline.ChunkAnnotationUtils

public class ChunkAnnotationUtils
extends Object

Utility functions for annotating chunks

Author:
Angel Chang

Constructor Summary
ChunkAnnotationUtils()
           
 
Method Summary
static void annotateChunk(CoreMap annotation, Class newAnnotationKey, Class aggrKey, CoreMapAttributeAggregator aggregator)
           
static void annotateChunk(CoreMap chunk, List<CoreLabel> tokens, int tokenStartIndex, int tokenEndIndex, int totalTokenOffset)
          Annotates a CoreMap representing a chunk with basic chunk information: CharacterOffsetBeginAnnotation - set to CharacterOffsetBeginAnnotation of first token in chunk; CharacterOffsetEndAnnotation - set to CharacterOffsetEndAnnotation of last token in chunk; TokensAnnotation - List of tokens in this chunk; TokenBeginAnnotation - Index of first token in chunk (index in original list of tokens), i.e. tokenStartIndex + totalTokenOffset; TokenEndAnnotation - Index of last token in chunk (index in original list of tokens), i.e. tokenEndIndex + totalTokenOffset.
static void annotateChunk(CoreMap chunk, Map<String,String> attributes)
           
static void annotateChunks(List<? extends CoreMap> chunks, int start, int end, Map<String,String> attributes)
           
static void annotateChunks(List<? extends CoreMap> chunks, Map<String,String> attributes)
           
static void annotateChunkText(CoreMap chunk, Class tokenTextKey)
          Annotates a CoreMap representing a chunk with text information: TextAnnotation - String representing tokens in this chunk (token text separated by space)
static void annotateChunkText(CoreMap chunk, CoreMap origAnnotation)
          Annotates a CoreMap representing a chunk with text information TextAnnotation - String extracted from the origAnnotation using character offset information for this chunk
static void annotateChunkTokens(CoreMap chunk, Class tokenChunkKey, Class tokenLabelKey)
          Annotates tokens in chunk
static boolean checkOffsets(CoreMap docAnnotation)
          Checks whether the offsets of the document and its sentences match
static void copyUnsetAnnotations(CoreMap src, CoreMap dest)
          Copies annotation over to this coremap if not already set
static boolean fixChunkSentenceBoundaries(CoreMap docAnnotation, List<IntPair> chunkCharOffsets)
          Given a list of character offsets for chunks, fix sentence splitting so that sentences don't break the chunks
static boolean fixChunkSentenceBoundaries(CoreMap docAnnotation, List<IntPair> chunkCharOffsets, boolean offsetsAreNotSorted, boolean extendedFixSentence, boolean moreExtendedFixSentence)
          Given a list of character offsets for chunks, fix sentence splitting so that sentences don't break the chunks
static boolean fixChunkTokenBoundaries(CoreMap docAnnotation, List<IntPair> chunkCharOffsets)
          Given a list of character offsets for chunks, fix tokenization so that token boundaries occur at the boundaries of chunks
static boolean fixTokenOffsets(CoreMap docAnnotation)
          Fix token offsets of sentences to match those in the document (assumes tokens are shared). Sentence token indices may not match the document token list if certain HTML elements are ignored.
static Annotation getAnnotatedChunk(CoreMap annotation, int tokenStartIndex, int tokenEndIndex)
          Create a new chunk Annotation with basic chunk information: CharacterOffsetBeginAnnotation - set to CharacterOffsetBeginAnnotation of first token in chunk; CharacterOffsetEndAnnotation - set to CharacterOffsetEndAnnotation of last token in chunk; TokensAnnotation - List of tokens in this chunk; TokenBeginAnnotation - Index of first token in chunk (index in original list of tokens), i.e. tokenStartIndex + annotation's TokenBeginAnnotation; TokenEndAnnotation - Index of last token in chunk (index in original list of tokens), i.e. tokenEndIndex + annotation's TokenBeginAnnotation; TextAnnotation - String extracted from the origAnnotation using character offset information for this chunk.
static Annotation getAnnotatedChunk(CoreMap annotation, int tokenStartIndex, int tokenEndIndex, Class tokenChunkKey, Class tokenLabelKey)
          Create a new chunk Annotation with basic chunk information: CharacterOffsetBeginAnnotation - set to CharacterOffsetBeginAnnotation of first token in chunk; CharacterOffsetEndAnnotation - set to CharacterOffsetEndAnnotation of last token in chunk; TokensAnnotation - List of tokens in this chunk; TokenBeginAnnotation - Index of first token in chunk (index in original list of tokens), i.e. tokenStartIndex + annotation's TokenBeginAnnotation; TokenEndAnnotation - Index of last token in chunk (index in original list of tokens), i.e. tokenEndIndex + annotation's TokenBeginAnnotation; TextAnnotation - String extracted from the origAnnotation using character offset information for this chunk.
static Annotation getAnnotatedChunk(List<CoreLabel> tokens, int tokenStartIndex, int tokenEndIndex, int totalTokenOffset)
          Create a new chunk Annotation with basic chunk information: CharacterOffsetBeginAnnotation - set to CharacterOffsetBeginAnnotation of first token in chunk; CharacterOffsetEndAnnotation - set to CharacterOffsetEndAnnotation of last token in chunk; TokensAnnotation - List of tokens in this chunk; TokenBeginAnnotation - Index of first token in chunk (index in original list of tokens), i.e. tokenStartIndex + totalTokenOffset; TokenEndAnnotation - Index of last token in chunk (index in original list of tokens), i.e. tokenEndIndex + totalTokenOffset.
static Annotation getAnnotatedChunk(List<CoreLabel> tokens, int tokenStartIndex, int tokenEndIndex, int totalTokenOffset, Class tokenChunkKey, Class tokenTextKey, Class tokenLabelKey)
          Create a new chunk Annotation with basic chunk information: CharacterOffsetBeginAnnotation - set to CharacterOffsetBeginAnnotation of first token in chunk; CharacterOffsetEndAnnotation - set to CharacterOffsetEndAnnotation of last token in chunk; TokensAnnotation - List of tokens in this chunk; TokenBeginAnnotation - Index of first token in chunk (index in original list of tokens), i.e. tokenStartIndex + totalTokenOffset; TokenEndAnnotation - Index of last token in chunk (index in original list of tokens), i.e. tokenEndIndex + totalTokenOffset; TextAnnotation - String extracted from the origAnnotation using character offset information for this chunk.
static List<CoreMap> getAnnotatedChunksUsingSortedCharOffsets(CoreMap annotation, List<IntPair> charOffsets)
           
static List<CoreMap> getAnnotatedChunksUsingSortedCharOffsets(CoreMap annotation, List<IntPair> charOffsets, boolean charOffsetIsRelative, Class tokenChunkKey, Class tokenLabelKey, boolean allowPartialTokens)
          Create a list of new chunk Annotations with basic chunk information: CharacterOffsetBeginAnnotation - set to CharacterOffsetBeginAnnotation of first token in chunk; CharacterOffsetEndAnnotation - set to CharacterOffsetEndAnnotation of last token in chunk; TokensAnnotation - List of tokens in this chunk; TokenBeginAnnotation - Index of first token in chunk (index in original list of tokens), i.e. tokenStartIndex + annotation's TokenBeginAnnotation; TokenEndAnnotation - Index of last token in chunk (index in original list of tokens), i.e. tokenEndIndex + annotation's TokenBeginAnnotation; TextAnnotation - String extracted from the origAnnotation using character offset information for this chunk.
static CoreMap getAnnotatedChunkUsingCharOffsets(CoreMap annotation, int charOffsetStart, int charOffsetEnd)
           
static Interval<Integer> getChunkOffsetsUsingCharOffsets(List<? extends CoreMap> chunkList, int charStart, int charEnd)
          Return chunk offsets
static Character getFirstNonWsChar(CoreMap sent)
           
static Integer getFirstNonWsCharOffset(CoreMap sent, boolean relative)
           
static CoreMap getMergedChunk(List<? extends CoreMap> chunkList, int chunkIndexStart, int chunkIndexEnd, Map<Class,CoreMapAttributeAggregator> aggregators)
          Create chunk that is merged from chunkIndexStart to chunkIndexEnd (exclusive)
static CoreMap getMergedChunk(List<? extends CoreMap> chunkList, String origText, int chunkIndexStart, int chunkIndexEnd)
          Create chunk that is merged from chunkIndexStart to chunkIndexEnd (exclusive)
static String getTokenText(List<? extends CoreMap> tokens, Class tokenTextKey)
           
static String getTokenText(List<? extends CoreMap> tokens, Class tokenTextKey, String delimiter)
           
static String getTrimmedText(CoreMap sent)
           
static void mergeChunks(List<CoreMap> chunkList, String origText, int chunkIndexStart, int chunkIndexEnd)
          Merge chunks from chunkIndexStart to chunkIndexEnd (exclusive) and replace them in the list
 
Methods inherited from class java.lang.Object
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
 

Constructor Detail

ChunkAnnotationUtils

public ChunkAnnotationUtils()
Method Detail

checkOffsets

public static boolean checkOffsets(CoreMap docAnnotation)
Checks whether the offsets of the document and its sentences match

Parameters:
docAnnotation -
Returns:

fixTokenOffsets

public static boolean fixTokenOffsets(CoreMap docAnnotation)
Fix token offsets of sentences to match those in the document (assumes tokens are shared). Sentence token indices may not match the document token list if certain HTML elements are ignored.

Parameters:
docAnnotation -
Returns:
true if fix was okay, false otherwise

copyUnsetAnnotations

public static void copyUnsetAnnotations(CoreMap src,
                                        CoreMap dest)
Copies annotation over to this coremap if not already set


fixChunkTokenBoundaries

public static boolean fixChunkTokenBoundaries(CoreMap docAnnotation,
                                              List<IntPair> chunkCharOffsets)
Given a list of character offsets for chunks, fix tokenization so that token boundaries occur at the boundaries of chunks

Parameters:
docAnnotation -
chunkCharOffsets -

getMergedChunk

public static CoreMap getMergedChunk(List<? extends CoreMap> chunkList,
                                     String origText,
                                     int chunkIndexStart,
                                     int chunkIndexEnd)
Create chunk that is merged from chunkIndexStart to chunkIndexEnd (exclusive)

Parameters:
chunkList - - List of chunks
origText - - Text from which to extract chunk text
chunkIndexStart - - Index of first chunk to merge
chunkIndexEnd - - Index of last chunk to merge (exclusive)
Returns:
new merged chunk

getMergedChunk

public static CoreMap getMergedChunk(List<? extends CoreMap> chunkList,
                                     int chunkIndexStart,
                                     int chunkIndexEnd,
                                     Map<Class,CoreMapAttributeAggregator> aggregators)
Create chunk that is merged from chunkIndexStart to chunkIndexEnd (exclusive)

Parameters:
chunkList - - List of chunks
chunkIndexStart - - Index of first chunk to merge
chunkIndexEnd - - Index of last chunk to merge (exclusive)
aggregators - - Aggregators
Returns:
new merged chunk

getChunkOffsetsUsingCharOffsets

public static Interval<Integer> getChunkOffsetsUsingCharOffsets(List<? extends CoreMap> chunkList,
                                                                int charStart,
                                                                int charEnd)
Return chunk offsets

Parameters:
chunkList - - List of chunks
charStart - - character begin offset
charEnd - - character end offset
Returns:
chunk offsets

mergeChunks

public static void mergeChunks(List<CoreMap> chunkList,
                               String origText,
                               int chunkIndexStart,
                               int chunkIndexEnd)
Merge chunks from chunkIndexStart to chunkIndexEnd (exclusive) and replace them in the list

Parameters:
chunkList - - List of chunks
origText - - Text from which to extract chunk text
chunkIndexStart - - Index of first chunk to merge
chunkIndexEnd - - Index of last chunk to merge (exclusive)

getFirstNonWsChar

public static Character getFirstNonWsChar(CoreMap sent)

getFirstNonWsCharOffset

public static Integer getFirstNonWsCharOffset(CoreMap sent,
                                              boolean relative)

getTrimmedText

public static String getTrimmedText(CoreMap sent)

fixChunkSentenceBoundaries

public static boolean fixChunkSentenceBoundaries(CoreMap docAnnotation,
                                                 List<IntPair> chunkCharOffsets)
Given a list of character offsets for chunks, fix sentence splitting so that sentences don't break the chunks

Parameters:
docAnnotation - Document with sentences
chunkCharOffsets - ordered pairs of different chunks that should appear in sentences
Returns:
true if fix was okay (chunks are in all sentences), false otherwise

fixChunkSentenceBoundaries

public static boolean fixChunkSentenceBoundaries(CoreMap docAnnotation,
                                                 List<IntPair> chunkCharOffsets,
                                                 boolean offsetsAreNotSorted,
                                                 boolean extendedFixSentence,
                                                 boolean moreExtendedFixSentence)
Given a list of character offsets for chunks, fix sentence splitting so that sentences don't break the chunks

Parameters:
docAnnotation - Document with sentences
chunkCharOffsets - ordered pairs of different chunks that should appear in sentences
offsetsAreNotSorted - Treat each pair of offsets as independent (look through all sentences again)
extendedFixSentence - Do extended sentence fixing based on some heuristics
moreExtendedFixSentence - Do even more extended sentence fixing based on some heuristics
Returns:
true if fix was okay (chunks are in all sentences), false otherwise

annotateChunk

public static void annotateChunk(CoreMap chunk,
                                 List<CoreLabel> tokens,
                                 int tokenStartIndex,
                                 int tokenEndIndex,
                                 int totalTokenOffset)
Annotates a CoreMap representing a chunk with basic chunk information: CharacterOffsetBeginAnnotation - set to CharacterOffsetBeginAnnotation of first token in chunk; CharacterOffsetEndAnnotation - set to CharacterOffsetEndAnnotation of last token in chunk; TokensAnnotation - List of tokens in this chunk; TokenBeginAnnotation - Index of first token in chunk (index in original list of tokens), i.e. tokenStartIndex + totalTokenOffset; TokenEndAnnotation - Index of last token in chunk (index in original list of tokens), i.e. tokenEndIndex + totalTokenOffset.

Parameters:
chunk - - CoreMap to be annotated
tokens - - List of tokens to look for chunks
tokenStartIndex - - Index (relative to current list of tokens) at which this chunk starts
tokenEndIndex - - Index (relative to current list of tokens) at which this chunk ends (not inclusive)
totalTokenOffset - - Index of tokens to offset by

getTokenText

public static String getTokenText(List<? extends CoreMap> tokens,
                                  Class tokenTextKey)

getTokenText

public static String getTokenText(List<? extends CoreMap> tokens,
                                  Class tokenTextKey,
                                  String delimiter)

annotateChunkText

public static void annotateChunkText(CoreMap chunk,
                                     Class tokenTextKey)
Annotates a CoreMap representing a chunk with text information: TextAnnotation - String representing tokens in this chunk (token text separated by space)

Parameters:
chunk - - CoreMap to be annotated
tokenTextKey - - Key to use to find the token text

annotateChunkText

public static void annotateChunkText(CoreMap chunk,
                                     CoreMap origAnnotation)
Annotates a CoreMap representing a chunk with text information TextAnnotation - String extracted from the origAnnotation using character offset information for this chunk

Parameters:
chunk - - CoreMap to be annotated
origAnnotation - - Annotation from which to extract the text for this chunk

annotateChunkTokens

public static void annotateChunkTokens(CoreMap chunk,
                                       Class tokenChunkKey,
                                       Class tokenLabelKey)
Annotates tokens in chunk

Parameters:
chunk - - CoreMap representing chunk (should have TextAnnotation and TokensAnnotation)
tokenChunkKey - - If not null, each token is annotated with the chunk using this key
tokenLabelKey - - If not null, each token is annotated with the text associated with the chunk using this key

getAnnotatedChunk

public static Annotation getAnnotatedChunk(List<CoreLabel> tokens,
                                           int tokenStartIndex,
                                           int tokenEndIndex,
                                           int totalTokenOffset)
Create a new chunk Annotation with basic chunk information: CharacterOffsetBeginAnnotation - set to CharacterOffsetBeginAnnotation of first token in chunk; CharacterOffsetEndAnnotation - set to CharacterOffsetEndAnnotation of last token in chunk; TokensAnnotation - List of tokens in this chunk; TokenBeginAnnotation - Index of first token in chunk (index in original list of tokens), i.e. tokenStartIndex + totalTokenOffset; TokenEndAnnotation - Index of last token in chunk (index in original list of tokens), i.e. tokenEndIndex + totalTokenOffset.

Parameters:
tokens - - List of tokens to look for chunks
tokenStartIndex - - Index (relative to current list of tokens) at which this chunk starts
tokenEndIndex - - Index (relative to current list of tokens) at which this chunk ends (not inclusive)
totalTokenOffset - - Index of tokens to offset by
Returns:
Annotation representing new chunk

getAnnotatedChunk

public static Annotation getAnnotatedChunk(List<CoreLabel> tokens,
                                           int tokenStartIndex,
                                           int tokenEndIndex,
                                           int totalTokenOffset,
                                           Class tokenChunkKey,
                                           Class tokenTextKey,
                                           Class tokenLabelKey)
Create a new chunk Annotation with basic chunk information: CharacterOffsetBeginAnnotation - set to CharacterOffsetBeginAnnotation of first token in chunk; CharacterOffsetEndAnnotation - set to CharacterOffsetEndAnnotation of last token in chunk; TokensAnnotation - List of tokens in this chunk; TokenBeginAnnotation - Index of first token in chunk (index in original list of tokens), i.e. tokenStartIndex + totalTokenOffset; TokenEndAnnotation - Index of last token in chunk (index in original list of tokens), i.e. tokenEndIndex + totalTokenOffset; TextAnnotation - String extracted from the origAnnotation using character offset information for this chunk.

Parameters:
tokens - - List of tokens to look for chunks
tokenStartIndex - - Index (relative to current list of tokens) at which this chunk starts
tokenEndIndex - - Index (relative to current list of tokens) at which this chunk ends (not inclusive)
totalTokenOffset - - Index of tokens to offset by
tokenChunkKey - - If not null, each token is annotated with the chunk using this key
tokenTextKey - - Key to use to find the token text
tokenLabelKey - - If not null, each token is annotated with the text associated with the chunk using this key
Returns:
Annotation representing new chunk

getAnnotatedChunk

public static Annotation getAnnotatedChunk(CoreMap annotation,
                                           int tokenStartIndex,
                                           int tokenEndIndex)
Create a new chunk Annotation with basic chunk information: CharacterOffsetBeginAnnotation - set to CharacterOffsetBeginAnnotation of first token in chunk; CharacterOffsetEndAnnotation - set to CharacterOffsetEndAnnotation of last token in chunk; TokensAnnotation - List of tokens in this chunk; TokenBeginAnnotation - Index of first token in chunk (index in original list of tokens), i.e. tokenStartIndex + annotation's TokenBeginAnnotation; TokenEndAnnotation - Index of last token in chunk (index in original list of tokens), i.e. tokenEndIndex + annotation's TokenBeginAnnotation; TextAnnotation - String extracted from the origAnnotation using character offset information for this chunk.

Parameters:
annotation - - Annotation from which to extract the text for this chunk
tokenStartIndex - - Index (relative to current list of tokens) at which this chunk starts
tokenEndIndex - - Index (relative to current list of tokens) at which this chunk ends (not inclusive)
Returns:
Annotation representing new chunk

getAnnotatedChunk

public static Annotation getAnnotatedChunk(CoreMap annotation,
                                           int tokenStartIndex,
                                           int tokenEndIndex,
                                           Class tokenChunkKey,
                                           Class tokenLabelKey)
Create a new chunk Annotation with basic chunk information: CharacterOffsetBeginAnnotation - set to CharacterOffsetBeginAnnotation of first token in chunk; CharacterOffsetEndAnnotation - set to CharacterOffsetEndAnnotation of last token in chunk; TokensAnnotation - List of tokens in this chunk; TokenBeginAnnotation - Index of first token in chunk (index in original list of tokens), i.e. tokenStartIndex + annotation's TokenBeginAnnotation; TokenEndAnnotation - Index of last token in chunk (index in original list of tokens), i.e. tokenEndIndex + annotation's TokenBeginAnnotation; TextAnnotation - String extracted from the origAnnotation using character offset information for this chunk.

Parameters:
annotation - - Annotation from which to extract the text for this chunk
tokenStartIndex - - Index (relative to current list of tokens) at which this chunk starts
tokenEndIndex - - Index (relative to current list of tokens) at which this chunk ends (not inclusive)
tokenChunkKey - - If not null, each token is annotated with the chunk using this key
tokenLabelKey - - If not null, each token is annotated with the text associated with the chunk using this key
Returns:
Annotation representing new chunk

getAnnotatedChunkUsingCharOffsets

public static CoreMap getAnnotatedChunkUsingCharOffsets(CoreMap annotation,
                                                        int charOffsetStart,
                                                        int charOffsetEnd)

getAnnotatedChunksUsingSortedCharOffsets

public static List<CoreMap> getAnnotatedChunksUsingSortedCharOffsets(CoreMap annotation,
                                                                     List<IntPair> charOffsets)

getAnnotatedChunksUsingSortedCharOffsets

public static List<CoreMap> getAnnotatedChunksUsingSortedCharOffsets(CoreMap annotation,
                                                                     List<IntPair> charOffsets,
                                                                     boolean charOffsetIsRelative,
                                                                     Class tokenChunkKey,
                                                                     Class tokenLabelKey,
                                                                     boolean allowPartialTokens)
Create a list of new chunk Annotations with basic chunk information: CharacterOffsetBeginAnnotation - set to CharacterOffsetBeginAnnotation of first token in chunk; CharacterOffsetEndAnnotation - set to CharacterOffsetEndAnnotation of last token in chunk; TokensAnnotation - List of tokens in this chunk; TokenBeginAnnotation - Index of first token in chunk (index in original list of tokens), i.e. tokenStartIndex + annotation's TokenBeginAnnotation; TokenEndAnnotation - Index of last token in chunk (index in original list of tokens), i.e. tokenEndIndex + annotation's TokenBeginAnnotation; TextAnnotation - String extracted from the origAnnotation using character offset information for this chunk.

Parameters:
annotation - - Annotation from which to extract the text for this chunk
charOffsets - - List of start and end (not inclusive) character offsets. Note: the char offsets are assumed to be sorted and non-overlapping!
charOffsetIsRelative - - Whether the character offsets are relative to the current annotation or absolute offsets
tokenChunkKey - - If not null, each token is annotated with the chunk using this key
tokenLabelKey - - If not null, each token is annotated with the text associated with the chunk using this key
allowPartialTokens - - Whether to allow partial tokens or not
Returns:
List of annotation representing new chunks

annotateChunk

public static void annotateChunk(CoreMap annotation,
                                 Class newAnnotationKey,
                                 Class aggrKey,
                                 CoreMapAttributeAggregator aggregator)

annotateChunk

public static void annotateChunk(CoreMap chunk,
                                 Map<String,String> attributes)

annotateChunks

public static void annotateChunks(List<? extends CoreMap> chunks,
                                  int start,
                                  int end,
                                  Map<String,String> attributes)

annotateChunks

public static void annotateChunks(List<? extends CoreMap> chunks,
                                  Map<String,String> attributes)


Stanford NLP Group