#!/usr/bin/env bash
#
# Some commands used with the CMU-Cambridge Language Modelling Toolkit.
#
# Starting from the training text austen.txt this script:
#   1. computes word frequencies and a vocabulary,
#   2. dumps word n-grams as text,
#   3. builds binary 2-, 3- and 4-gram language models,
#   4. reports the perplexity of each model on ja-pers-clean.txt and
#      ja-pers-tiny.txt (writing a per-word annotation for the latter).
#
# Requirements: the toolkit programs (text2wfreq, wfreq2vocab, text2wngram,
# ngram2mgram, text2idngram, idngram2lm, evallm) must be on PATH, and
# austen.txt, ja-pers-clean.txt and ja-pers-tiny.txt must be in the current
# directory. All outputs (a.*, ja-pers-tiny.*ann) are written there too.

# Every stage consumes the previous stage's output, so stop at the first
# failure instead of feeding empty files downstream.
set -euo pipefail

readonly corpus=austen.txt
readonly temp_dir=/tmp   # scratch space handed to the toolkit's -temp option

# Get word frequencies.
text2wfreq < "$corpus" > a.wfreq

# Number of words (a.wfreq holds one line per distinct word).
wc -l a.wfreq

# Make vocab (of all words here).
# NOTE(review): -top 15000 keeps every word only if the count printed by
# `wc -l` above does not exceed 15000 - confirm for your corpus.
wfreq2vocab -top 15000 < a.wfreq > a.vocab

# n-grams output as words in text files.
text2wngram -n 2 -temp "$temp_dir" < "$corpus" > a.w2gram

# If you do that once with a large -n, you can then derive the shorter
# m-grams from it with ngram2mgram instead of re-reading the text.
# ngram2mgram reads the n-gram file on stdin and writes the m-gram file on
# stdout; the original note listed only "a.w4gram" with no redirections.
# NOTE(review): the 5-gram input name a.w5gram is inferred from "-n 5".
text2wngram -n 5 -temp "$temp_dir" < "$corpus" > a.w5gram
ngram2mgram -words -n 5 -m 4 < a.w5gram > a.w4gram

# Building the LMs used at the end of the chapter: for each order n, turn
# the text into an id n-gram file and compile it into a binary model.
for n in 2 3 4; do
  text2idngram -n "$n" -vocab a.vocab -temp "$temp_dir" \
    < "$corpus" > "a.id${n}gram"
  idngram2lm -idngram "a.id${n}gram" -vocab a.vocab -n "$n" \
    -binary "a.gt${n}binlm"
done

# Evaluate each model. evallm is an interactive command interpreter, so the
# "perplexity" requests are sent to it on stdin rather than run as shell
# commands; the explicit "quit" ends the session cleanly.
for n in 2 3 4; do
  printf '%s\n' \
    'perplexity -text ja-pers-clean.txt' \
    "perplexity -text ja-pers-tiny.txt -annotate ja-pers-tiny.${n}ann" \
    'quit' \
    | evallm -binary "a.gt${n}binlm"
done