Here is a stream-based approach. It does not require much memory while computing the n-grams.
/** Demo: lazily enumerates the word n-grams of a sentence and prints each one. */
object ngramstream extends App {

  /** Applies `f` to every element of `st`, in order, for its side effect.
    *
    * @param st the stream to traverse
    * @param f  the action run on each element
    * @return always an empty stream; the elements are consumed, not collected
    */
  @scala.annotation.tailrec
  def process(st: Stream[Array[String]])(f: Array[String] => Unit): Stream[Array[String]] =
    st match {
      case head #:: rest =>
        f(head)
        process(rest)(f)
      case _ => Stream.empty[Array[String]]
    }

  /** Builds every contiguous word sequence of length 2 up to `n`, shortest length first.
    *
    * 1-grams are deliberately excluded. Window sizes larger than the input are skipped:
    * `sliding` would otherwise return a single truncated window for each of them, which
    * is not a real n-gram and would be repeated once per oversized size.
    *
    * @param n     the largest n-gram size to produce
    * @param words the tokens to build n-grams from
    * @return a lazily evaluated stream of n-grams; empty when `n < 2` or fewer than two words
    */
  def ngrams(n: Int, words: Array[String]): Stream[Array[String]] = {
    val largest = math.min(n, words.length)
    // flatMap over a Stream only builds the windows of a given size once they are reached.
    (2 to largest).toStream.flatMap(size => words.sliding(size).toStream)
  }

  val words = "the bee is the bee of the bees"
  val n = 4
  // Note: Stream memoizes evaluated cells, so this reference keeps them reachable.
  val ngrams2 = ngrams(n, words.split(" "))

  process(ngrams2) { gram =>
    println(gram.toList)
  }
}
OUTPUT:
List(the, bee)
List(bee, is)
List(is, the)
List(the, bee)
List(bee, of)
List(of, the)
List(the, bees)
List(the, bee, is)
List(bee, is, the)
List(is, the, bee)
List(the, bee, of)
List(bee, of, the)
List(of, the, bees)
List(the, bee, is, the)
List(bee, is, the, bee)
List(is, the, bee, of)
List(the, bee, of, the)
List(bee, of, the, bees)