// Stanford TMT Example 1 - Loading data
// http://nlp.stanford.edu/software/tmt/0.4/
//
// Builds a text-preparation pipeline over a CSV file and prints a description
// of the resulting dataset together with the terms placed on the stop list.
//
// NOTE: every stage below must stay on its own physical line. A `//` comment
// runs to the end of the line, so joining lines would silently comment out
// every stage that follows the first inline remark.

// tells Scala where to find the TMT classes
import scalanlp.io._;
import scalanlp.stage._;
import scalanlp.stage.text._;
import scalanlp.text.tokenize._;
import scalanlp.pipes.Pipes.global._;

import edu.stanford.nlp.tmt.stage._;
import edu.stanford.nlp.tmt.model.lda._;
import edu.stanford.nlp.tmt.model.llda._;

// input file, with column 1 designated as the ID column
val source = CSVFile("pubmed-oa-subset.csv") ~> IDColumn(1);

// tokenizer pipeline applied to each document's text
val tokenizer = {
  SimpleEnglishTokenizer() ~>            // tokenize on space and punctuation
  CaseFolder() ~>                        // lowercase everything
  WordsAndNumbersOnlyFilter() ~>         // ignore non-words and non-numbers
  MinimumLengthFilter(3)                 // take terms with >=3 characters
}

// text-field pipeline: extraction, tokenization and term/document filtering
val text = {
  source ~>                              // read from the source file
  Column(4) ~>                           // select column containing text
  TokenizeWith(tokenizer) ~>             // tokenize with tokenizer above
  TermCounter() ~>                       // collect counts (needed below)
  TermMinimumDocumentCountFilter(4) ~>   // filter terms in <4 docs
  TermDynamicStopListFilter(30) ~>       // filter out 30 most common terms
  DocumentMinimumLengthFilter(5)         // take only docs with >=5 terms
}

// display information about the loaded dataset
println("Description of the loaded text field:");
println(text.description);

println();
println("------------------------------------");
println();

// list every term recorded in the pipeline's TermStopList metadata
println("Terms in the stop list:");
for (term <- text.meta[TermStopList]) {
  println(" " + term);
}