I'm not sure about the `is stopwords` check in your function — I imagine it should be `in stopwords` — but you can use a `collections.Counter` with `most_common(10)` to get the 10 most frequent words:
from collections import Counter
from string import punctuation
def content_text(text):
stopwords = set(nltk.corpus.stopwords.words('english')) # 0(1) lookups
with_stp = Counter()
without_stp = Counter()
with open(text) as f:
for line in f:
spl = line.split()
# update count off all words in the line that are in stopwrods
with_stp.update(w.lower().rstrip(punctuation) for w in spl if w.lower() in stopwords)
# update count off all words in the line that are not in stopwords
without_stp.update(w.lower().rstrip(punctuation) for w in spl if w not in stopwords)
# return a list with top ten most common words from each
return [x for x in with_stp.most_common(10)],[y for y in without_stp.most_common(10)]
wth_stop, wthout_stop = content_text(...)
If you are passing in an NLTK file object, just iterate over it directly:
def content_text(text):
    """Split the ten most frequent tokens into stopwords and non-stopwords.

    Parameters
    ----------
    text : iterable of str
        An iterable of word tokens (e.g. an NLTK corpus word stream).

    Returns
    -------
    tuple of (list, list)
        The ten most common lowercase stopwords and the ten most common
        lowercase non-stopwords — bare words, no counts.
    """
    stopwords = set(nltk.corpus.stopwords.words('english'))
    with_stp = Counter()
    without_stp = Counter()
    for token in text:
        lowered = token.lower()
        # Route each lowercased token to the matching counter.
        bucket = with_stp if lowered in stopwords else without_stp
        bucket[lowered] += 1

    def top_ten(counter):
        # Drop the counts; keep only the words themselves.
        return [word for word, _ in counter.most_common(10)]

    return top_ten(with_stp), top_ten(without_stp)
# Example: the inaugural corpus yields one token per word, so it can be
# passed straight in (requires the NLTK 'inaugural' corpus to be downloaded).
print(content_text(nltk.corpus.inaugural.words('2009-Obama.txt')))
The nltk method includes punctuation so that may not be what you want.