edu.stanford.nlp.util
Class StringUtils

java.lang.Object
  extended by edu.stanford.nlp.util.StringUtils

public class StringUtils
extends java.lang.Object

StringUtils is a class for random String things, including output formatting and command line argument parsing.


Field Summary
static java.lang.String[] EMPTY_STRING_ARRAY
           
 
Method Summary
static java.util.Map<java.lang.String,java.lang.String[]> argsToMap(java.lang.String[] args)
          Parses command line arguments into a Map.
static java.util.Map<java.lang.String,java.lang.String[]> argsToMap(java.lang.String[] args, java.util.Map<java.lang.String,java.lang.Integer> flagsToNumArgs)
          Parses command line arguments into a Map.
static java.util.Properties argsToProperties(java.lang.String[] args)
          In this version each flag has zero or one argument.
static java.util.Properties argsToProperties(java.lang.String[] args, java.util.Map flagsToNumArgs)
          Analogous to argsToMap(java.lang.String[]).
static java.lang.String capitalize(java.lang.String s)
          Uppercases the first character of a string.
static int editDistance(java.lang.String s, java.lang.String t)
          Computes the Levenshtein (edit) distance of the two given Strings.
static java.lang.String escapeString(java.lang.String s, char[] charsToEscape, char escapeChar)
           
static java.lang.String fileNameClean(java.lang.String s)
          Returns a "clean" version of the given filename in which spaces have been converted to dashes and all non-alphanumeric chars are underscores.
static boolean find(java.lang.String str, java.lang.String regex)
          Say whether this regular expression can be found inside this String.
static java.lang.String getShortClassName(java.lang.Object o)
          Returns a short class name for an object.
static java.lang.String join(java.lang.Iterable l, java.lang.String glue)
          Joins each elem in the Collection with the given glue.
static java.lang.String join(java.util.List l)
          Joins elems with a space.
static java.lang.String join(java.util.List l, java.lang.String glue)
          Joins each elem in the List with the given glue.
static java.lang.String join(java.lang.Object[] elements)
          Joins elems with a space.
static java.lang.String join(java.lang.Object[] elements, java.lang.String glue)
          Joins each elem in the array with the given glue.
static int longestCommonContiguousSubstring(java.lang.String s, java.lang.String t)
          Computes the longest common contiguous substring of s and t.
static int longestCommonSubstring(java.lang.String s, java.lang.String t)
          Computes the longest common substring of s and t.
static boolean lookingAt(java.lang.String str, java.lang.String regex)
          Say whether this regular expression can be found at the beginning of this String.
static void main(java.lang.String[] args)
          Tests the string edit distance function.
static java.lang.String makeHTMLTable(java.lang.String[][] table, java.lang.String[] rowLabels, java.lang.String[] colLabels)
          Returns an HTML table containing the matrix of Strings passed in.
static boolean matches(java.lang.String str, java.lang.String regex)
          Say whether this regular expression matches this String.
static int nthIndex(java.lang.String s, char ch, int n)
          Returns the index of the nth occurrence of ch in s, or -1 if there are fewer than n occurrences of ch.
static java.lang.String pad(java.lang.Object obj, int totalChars)
          Pads the toString value of the given Object.
static java.lang.String pad(java.lang.String str, int totalChars)
          Returns a String at least totalChars characters long by padding the input String str with spaces.
static java.lang.String padLeft(double d, int totalChars)
           
static java.lang.String padLeft(int i, int totalChars)
           
static java.lang.String padLeft(java.lang.Object obj, int totalChars)
           
static java.lang.String padLeft(java.lang.String str, int totalChars)
          Pads the given String to the left with spaces to ensure that it's at least totalChars long.
static java.lang.String padLeftOrTrim(java.lang.String str, int num)
          Pad or trim so as to produce a string of exactly a certain length.
static java.lang.String padOrTrim(java.lang.Object obj, int totalChars)
          Pad or trim the toString value of the given Object.
static java.lang.String padOrTrim(java.lang.String str, int num)
          Pad or trim so as to produce a string of exactly a certain length.
static java.util.Map<java.lang.String,java.lang.Object> parseCommandLineArguments(java.lang.String[] args)
          A simpler form of command line argument parsing.
static java.util.Map<java.lang.String,java.lang.Object> parseCommandLineArguments(java.lang.String[] args, boolean parseNumbers)
          A simpler form of command line argument parsing.
static java.lang.String pennPOSToWordnetPOS(java.lang.String s)
          Computes the WordNet 2.0 POS tag corresponding to the PTB POS tag s.
static void printStringOneCharPerLine(java.lang.String s)
           
static void printToFile(java.io.File file, java.lang.String message)
          Prints to a file.
static void printToFile(java.io.File file, java.lang.String message, boolean append)
          Prints to a file.
static void printToFile(java.lang.String filename, java.lang.String message)
          Prints to a file.
static void printToFile(java.lang.String filename, java.lang.String message, boolean append)
          Prints to a file.
static void printToFileLn(java.io.File file, java.lang.String message, boolean append)
          Prints to a file.
static void printToFileLn(java.lang.String filename, java.lang.String message, boolean append)
          Prints to a file.
static java.lang.String repeat(java.lang.String s, int times)
           
static java.lang.String searchAndReplace(java.lang.String text, java.lang.String from, java.lang.String to)
           
static java.lang.String slurpFile(java.io.File file)
          Returns all the text in the given File.
static java.lang.String slurpFile(java.lang.String filename)
          Returns all the text in the given file
static java.lang.String slurpFile(java.lang.String filename, java.lang.String encoding)
          Returns all the text in the given file with the given encoding.
static java.lang.String slurpFileNoExceptions(java.io.File file)
          Returns all the text in the given File.
static java.lang.String slurpFileNoExceptions(java.lang.String filename)
          Returns all the text in the given File.
static java.lang.String slurpFileNoExceptions(java.lang.String filename, java.lang.String encoding)
          Returns all the text in the given file with the given encoding.
static java.lang.String slurpGBFile(java.lang.String filename)
           
static java.lang.String slurpGBFileNoExceptions(java.lang.String filename)
           
static java.lang.String slurpGBURL(java.net.URL u)
          Returns all the text at the given URL.
static java.lang.String slurpGBURLNoExceptions(java.net.URL u)
          Returns all the text at the given URL.
static java.lang.String slurpReader(java.io.Reader reader)
          Returns all the text from the given Reader.
static java.lang.String slurpURL(java.lang.String path)
          Returns all the text at the given URL.
static java.lang.String slurpURL(java.net.URL u)
          Returns all the text at the given URL.
static java.lang.String slurpURL(java.net.URL u, java.lang.String encoding)
          Returns all the text at the given URL.
static java.lang.String slurpURLNoExceptions(java.lang.String path)
          Returns all the text at the given URL.
static java.lang.String slurpURLNoExceptions(java.net.URL u)
          Returns all the text at the given URL.
static java.lang.String slurpURLNoExceptions(java.net.URL u, java.lang.String encoding)
          Returns all the text at the given URL.
static java.util.List split(java.lang.String s)
          Splits on whitespace (\\s+).
static java.util.List split(java.lang.String str, java.lang.String regex)
          Splits the given string using the given regex as delimiters.
static java.lang.String[] splitOnCharWithQuoting(java.lang.String s, char splitChar, char quoteChar, char escapeChar)
          This function splits the String s into multiple Strings using the splitChar.
static java.util.Properties stringToProperties(java.lang.String str)
          This method converts a comma-separated String (with whitespace optionally allowed after the comma) representing properties to a Properties object.
static java.lang.String stripNonAlphaNumerics(java.lang.String orig)
           
static java.lang.String toAscii(java.lang.String s)
           
static java.lang.String trim(java.lang.Object obj, int maxWidth)
           
static java.lang.String trim(java.lang.String s, int maxWidth)
          Returns s if it's at most maxWidth chars, otherwise chops right side to fit.
static java.lang.String truncate(int n, int smallestDigit, int biggestDigit)
          This returns a string from decimal digit smallestDigit to decimal digit biggestDigit.
 
Methods inherited from class java.lang.Object
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
 

Field Detail

EMPTY_STRING_ARRAY

public static final java.lang.String[] EMPTY_STRING_ARRAY
Method Detail

find

public static boolean find(java.lang.String str,
                           java.lang.String regex)
Say whether this regular expression can be found inside this String. This method provides one of the two "missing" convenience methods for regular expressions in the String class in JDK1.4. This is the one you'll want to use all the time if you're used to Perl. What were they smoking?

Parameters:
str - String to search for match in
regex - String to compile as the regular expression
Returns:
Whether the regex can be found in str

lookingAt

public static boolean lookingAt(java.lang.String str,
                                java.lang.String regex)
Say whether this regular expression can be found at the beginning of this String. This method provides one of the two "missing" convenience methods for regular expressions in the String class in JDK1.4.

Parameters:
str - String to search for match at start of
regex - String to compile as the regular expression
Returns:
Whether the regex can be found at the start of str

matches

public static boolean matches(java.lang.String str,
                              java.lang.String regex)
Say whether this regular expression matches this String. This method is the same as the String.matches() method, and is included just to give a call that is parallel to the other static regex methods in this class.

Parameters:
str - String to match against the regex in its entirety
regex - String to compile as the regular expression
Returns:
Whether the regex matches the whole of this str

slurpFile

public static java.lang.String slurpFile(java.io.File file)
                                  throws java.io.IOException
Returns all the text in the given File.

Throws:
java.io.IOException

slurpGBFileNoExceptions

public static java.lang.String slurpGBFileNoExceptions(java.lang.String filename)

slurpFile

public static java.lang.String slurpFile(java.lang.String filename,
                                         java.lang.String encoding)
                                  throws java.io.IOException
Returns all the text in the given file with the given encoding.

Throws:
java.io.IOException

slurpFileNoExceptions

public static java.lang.String slurpFileNoExceptions(java.lang.String filename,
                                                     java.lang.String encoding)
Returns all the text in the given file with the given encoding. If the file cannot be read (non-existent, etc.), then and only then the method returns null.


slurpGBFile

public static java.lang.String slurpGBFile(java.lang.String filename)
                                    throws java.io.IOException
Throws:
java.io.IOException

slurpReader

public static java.lang.String slurpReader(java.io.Reader reader)
Returns all the text from the given Reader.

Returns:
The text in the file.

slurpFile

public static java.lang.String slurpFile(java.lang.String filename)
                                  throws java.io.IOException
Returns all the text in the given file

Returns:
The text in the file.
Throws:
java.io.IOException

slurpFileNoExceptions

public static java.lang.String slurpFileNoExceptions(java.io.File file)
Returns all the text in the given File.

Returns:
The text in the file. May be an empty string if the file is empty. If the file cannot be read (non-existent, etc.), then and only then the method returns null.

slurpFileNoExceptions

public static java.lang.String slurpFileNoExceptions(java.lang.String filename)
Returns all the text in the given File.

Returns:
The text in the file. May be an empty string if the file is empty. If the file cannot be read (non-existent, etc.), then and only then the method returns null.

slurpGBURL

public static java.lang.String slurpGBURL(java.net.URL u)
                                   throws java.io.IOException
Returns all the text at the given URL.

Throws:
java.io.IOException

slurpGBURLNoExceptions

public static java.lang.String slurpGBURLNoExceptions(java.net.URL u)
Returns all the text at the given URL.


slurpURLNoExceptions

public static java.lang.String slurpURLNoExceptions(java.net.URL u,
                                                    java.lang.String encoding)
Returns all the text at the given URL.


slurpURL

public static java.lang.String slurpURL(java.net.URL u,
                                        java.lang.String encoding)
                                 throws java.io.IOException
Returns all the text at the given URL.

Throws:
java.io.IOException

slurpURL

public static java.lang.String slurpURL(java.net.URL u)
                                 throws java.io.IOException
Returns all the text at the given URL.

Throws:
java.io.IOException

slurpURLNoExceptions

public static java.lang.String slurpURLNoExceptions(java.net.URL u)
Returns all the text at the given URL.


slurpURL

public static java.lang.String slurpURL(java.lang.String path)
                                 throws java.lang.Exception
Returns all the text at the given URL.

Throws:
java.lang.Exception

slurpURLNoExceptions

public static java.lang.String slurpURLNoExceptions(java.lang.String path)
Returns all the text at the given URL. If the file cannot be read (non-existent, etc.), then and only then the method returns null.


join

public static java.lang.String join(java.lang.Iterable l,
                                    java.lang.String glue)
Joins each elem in the Collection with the given glue. For example, given a list of Integers, you can create a comma-separated list by calling join(numbers, ", ").


join

public static java.lang.String join(java.util.List l,
                                    java.lang.String glue)
Joins each elem in the List with the given glue. For example, given a list of Integers, you can create a comma-separated list by calling join(numbers, ", ").


join

public static java.lang.String join(java.lang.Object[] elements,
                                    java.lang.String glue)
Joins each elem in the array with the given glue. For example, given a list of ints, you can create a comma-separated list by calling join(numbers, ", ").


join

public static java.lang.String join(java.util.List l)
Joins elems with a space.


join

public static java.lang.String join(java.lang.Object[] elements)
Joins elems with a space.


split

public static java.util.List split(java.lang.String s)
Splits on whitespace (\\s+).


split

public static java.util.List split(java.lang.String str,
                                   java.lang.String regex)
Splits the given string using the given regex as delimiters. This method is the same as the String.split() method (except it throws the results in a List), and is included just to give a call that is parallel to the other static regex methods in this class.

Parameters:
str - String to split up
regex - String to compile as the regular expression
Returns:
List of Strings resulting from splitting on the regex

pad

public static java.lang.String pad(java.lang.String str,
                                   int totalChars)
Returns a String at least totalChars characters long by padding the input String str with spaces. If str is already longer than totalChars, it is returned unchanged.


pad

public static java.lang.String pad(java.lang.Object obj,
                                   int totalChars)
Pads the toString value of the given Object.


padOrTrim

public static java.lang.String padOrTrim(java.lang.String str,
                                         int num)
Pad or trim so as to produce a string of exactly a certain length.

Parameters:
str - The String to be padded or truncated
num - The desired length

padLeftOrTrim

public static java.lang.String padLeftOrTrim(java.lang.String str,
                                             int num)
Pad or trim so as to produce a string of exactly a certain length.

Parameters:
str - The String to be padded or truncated
num - The desired length

padOrTrim

public static java.lang.String padOrTrim(java.lang.Object obj,
                                         int totalChars)
Pad or trim the toString value of the given Object.


padLeft

public static java.lang.String padLeft(java.lang.String str,
                                       int totalChars)
Pads the given String to the left with spaces to ensure that it's at least totalChars long.


padLeft

public static java.lang.String padLeft(java.lang.Object obj,
                                       int totalChars)

padLeft

public static java.lang.String padLeft(int i,
                                       int totalChars)

padLeft

public static java.lang.String padLeft(double d,
                                       int totalChars)

trim

public static java.lang.String trim(java.lang.String s,
                                    int maxWidth)
Returns s if it's at most maxWidth chars, otherwise chops right side to fit.


trim

public static java.lang.String trim(java.lang.Object obj,
                                    int maxWidth)

repeat

public static java.lang.String repeat(java.lang.String s,
                                      int times)

fileNameClean

public static java.lang.String fileNameClean(java.lang.String s)
Returns a "clean" version of the given filename in which spaces have been converted to dashes and all non-alphanumeric chars are underscores.


nthIndex

public static int nthIndex(java.lang.String s,
                           char ch,
                           int n)
Returns the index of the nth occurrence of ch in s, or -1 if there are fewer than n occurrences of ch.


truncate

public static java.lang.String truncate(int n,
                                        int smallestDigit,
                                        int biggestDigit)
This returns a string from decimal digit smallestDigit to decimal digit biggestDigit. The smallest digit is labeled 1, and the limits are inclusive.


argsToMap

public static java.util.Map<java.lang.String,java.lang.String[]> argsToMap(java.lang.String[] args)
Parses command line arguments into a Map. Arguments of the form

-flag1 arg1a arg1b ... arg1m -flag2 -flag3 arg3a ... arg3n

will be parsed so that the flag is a key in the Map (including the hyphen) and its value will be a String[] containing the optional arguments (if present). The non-flag values not captured as flag arguments are collected into a String[] array and returned as the value of null in the Map. In this invocation, flags cannot take arguments, so all the String array values other than the value for null will be zero-length.

Parameters:
args - A command-line arguments array
Returns:
a Map of flag names to flag argument String arrays.

argsToMap

public static java.util.Map<java.lang.String,java.lang.String[]> argsToMap(java.lang.String[] args,
                                                                           java.util.Map<java.lang.String,java.lang.Integer> flagsToNumArgs)
Parses command line arguments into a Map. Arguments of the form

-flag1 arg1a arg1b ... arg1m -flag2 -flag3 arg3a ... arg3n

will be parsed so that the flag is a key in the Map (including the hyphen) and its value will be a String[] containing the optional arguments (if present). The non-flag values not captured as flag arguments are collected into a String[] array and returned as the value of null in the Map. In this invocation, the maximum number of arguments for each flag can be specified as an Integer value of the appropriate flag key in the flagsToNumArgs Map argument. (By default, flags cannot take arguments.)

Example of usage:

Map flagsToNumArgs = new HashMap(); flagsToNumArgs.put("-x",new Integer(2)); flagsToNumArgs.put("-d",new Integer(1)); Map result = argsToMap(args,flagsToNumArgs);

If a given flag appears more than once, the extra args are appended to the String[] value for that flag.

Parameters:
args - the argument array to be parsed
flagsToNumArgs - a Map of flag names to Integer values specifying the maximum number of allowed arguments for that flag (default 0).
Returns:
a Map of flag names to flag argument String arrays.

argsToProperties

public static java.util.Properties argsToProperties(java.lang.String[] args)
In this version each flag has zero or one argument.


argsToProperties

public static java.util.Properties argsToProperties(java.lang.String[] args,
                                                    java.util.Map flagsToNumArgs)
Analogous to argsToMap(java.lang.String[]). However, there are several key differences between this method and argsToMap(java.lang.String[]):

stringToProperties

public static java.util.Properties stringToProperties(java.lang.String str)
This method converts a comma-separated String (with whitespace optionally allowed after the comma) representing properties to a Properties object. Each property is "property=value". The value for properties without an explicitly given value is set to "true".


printToFileLn

public static void printToFileLn(java.io.File file,
                                 java.lang.String message,
                                 boolean append)
Prints to a file. If the file already exists, appends if append=true, and overwrites if append=false


printToFile

public static void printToFile(java.io.File file,
                               java.lang.String message,
                               boolean append)
Prints to a file. If the file already exists, appends if append=true, and overwrites if append=false


printToFile

public static void printToFile(java.io.File file,
                               java.lang.String message)
Prints to a file. If the file already exists, overwrites the file; does not append.


printToFile

public static void printToFile(java.lang.String filename,
                               java.lang.String message,
                               boolean append)
Prints to a file. If the file already exists, appends if append=true, and overwrites if append=false


printToFileLn

public static void printToFileLn(java.lang.String filename,
                                 java.lang.String message,
                                 boolean append)
Prints to a file. If the file already exists, appends if append=true, and overwrites if append=false


printToFile

public static void printToFile(java.lang.String filename,
                               java.lang.String message)
Prints to a file. If the file already exists, overwrites the file; does not append.


parseCommandLineArguments

public static java.util.Map<java.lang.String,java.lang.Object> parseCommandLineArguments(java.lang.String[] args)
A simpler form of command line argument parsing. Dan thinks this is highly superior to the overly complexified code that comes before it. Parses command line arguments into a Map. Arguments of the form -flag1 arg1 -flag2 -flag3 arg3 will be parsed so that the flag is a key in the Map (including the hyphen) and the optional argument will be its value (if present).

Parameters:
args -
Returns:
A Map from keys to possible values (String or null)

parseCommandLineArguments

public static java.util.Map<java.lang.String,java.lang.Object> parseCommandLineArguments(java.lang.String[] args,
                                                                                         boolean parseNumbers)
A simpler form of command line argument parsing. Dan thinks this is highly superior to the overly complexified code that comes before it. Parses command line arguments into a Map. Arguments of the form -flag1 arg1 -flag2 -flag3 arg3 will be parsed so that the flag is a key in the Map (including the hyphen) and the optional argument will be its value (if present). In this version, if the argument is numeric, it will be a Double value in the map, not a String.

Parameters:
args -
Returns:
A Map from keys to possible values (String, Double for numeric arguments, or null)

stripNonAlphaNumerics

public static java.lang.String stripNonAlphaNumerics(java.lang.String orig)

printStringOneCharPerLine

public static void printStringOneCharPerLine(java.lang.String s)

escapeString

public static java.lang.String escapeString(java.lang.String s,
                                            char[] charsToEscape,
                                            char escapeChar)

splitOnCharWithQuoting

public static java.lang.String[] splitOnCharWithQuoting(java.lang.String s,
                                                        char splitChar,
                                                        char quoteChar,
                                                        char escapeChar)
This function splits the String s into multiple Strings using the splitChar. However, it provides a quoting facility: it is possible to quote strings with the quoteChar. If the quoteChar occurs within the quoted expression, it must be prefaced by the escapeChar.

Parameters:
s - The String to split
splitChar -
quoteChar -
Returns:
An array of Strings that s is split into

longestCommonSubstring

public static int longestCommonSubstring(java.lang.String s,
                                         java.lang.String t)
Computes the longest common substring of s and t. The longest common substring of s and t is the longest run of characters that appear in order inside both s and t. Both s and t may have other extraneous characters along the way. This is like edit distance but with no substitution and a higher number means more similar. For example, the LCS of "abcD" and "aXbc" is 3 (abc).


longestCommonContiguousSubstring

public static int longestCommonContiguousSubstring(java.lang.String s,
                                                   java.lang.String t)
Computes the longest common contiguous substring of s and t. The LCCS is the longest run of characters that appear consecutively in both s and t. For instance, the LCCS of "color" and "colour" is 4, because of "colo".


editDistance

public static int editDistance(java.lang.String s,
                               java.lang.String t)
Computes the Levenshtein (edit) distance of the two given Strings.


pennPOSToWordnetPOS

public static java.lang.String pennPOSToWordnetPOS(java.lang.String s)
Computes the WordNet 2.0 POS tag corresponding to the PTB POS tag s.

Parameters:
s - a Penn TreeBank POS tag.

getShortClassName

public static java.lang.String getShortClassName(java.lang.Object o)
Returns a short class name for an object. This is the class name stripped of any package name.

Returns:
The name of the class minus a package name, for example ArrayList

capitalize

public static java.lang.String capitalize(java.lang.String s)
Uppercases the first character of a string.

Parameters:
s - a string to capitalize
Returns:
a capitalized version of the string

searchAndReplace

public static java.lang.String searchAndReplace(java.lang.String text,
                                                java.lang.String from,
                                                java.lang.String to)

makeHTMLTable

public static java.lang.String makeHTMLTable(java.lang.String[][] table,
                                             java.lang.String[] rowLabels,
                                             java.lang.String[] colLabels)
Returns an HTML table containing the matrix of Strings passed in. The first dimension of the matrix should represent the rows, and the second dimension the columns.


main

public static void main(java.lang.String[] args)
Tests the string edit distance function.


toAscii

public static java.lang.String toAscii(java.lang.String s)