# Ruby 2.0 # Reads stdin: ruby -n preprocess-twitter.rb # # Script for preprocessing tweets by Romain Paulus # with small modifications by Jeffrey Pennington def tokenize input # Different regex parts for smiley faces eyes = "[8:=;]" nose = "['`\-]?" input = input .gsub(/https?:\/\/\S+\b|www\.(\w+\.)+\S*/,"") .gsub("/"," / ") # Force splitting words appended with slashes (once we tokenized the URLs, of course) .gsub(/@\w+/, "") .gsub(/#{eyes}#{nose}[)d]+|[)d]+#{nose}#{eyes}/i, "") .gsub(/#{eyes}#{nose}p+/i, "") .gsub(/#{eyes}#{nose}\(+|\)+#{nose}#{eyes}/, "") .gsub(/#{eyes}#{nose}[\/|l*]/, "") .gsub(/<3/,"") .gsub(/[-+]?[.\d]*[\d]+[:,.\d]*/, "") .gsub(/#\S+/){ |hashtag| # Split hashtags on uppercase letters # TODO: also split hashtags with lowercase letters (requires more work to detect splits...) hashtag_body = hashtag[1..-1] if hashtag_body.upcase == hashtag_body result = " #{hashtag_body} " else result = ([""] + hashtag_body.split(/(?=[A-Z])/)).join(" ") end result } .gsub(/([!?.]){2,}/){ # Mark punctuation repetitions (eg. "!!!" => "! ") "#{$~[1]} " } .gsub(/\b(\S*?)(.)\2{2,}\b/){ # Mark elongated words (eg. "wayyyy" => "way ") # TODO: determine if the end letter should be repeated once or twice (use lexicon/dict) $~[1] + $~[2] + " " } .gsub(/([^a-z0-9()<>'`\-]){2,}/){ |word| "#{word.downcase} " } return input end puts tokenize($_)