Position of that Noun and Verb

# For every news title, print the title and then the first run of one to
# three consecutive nouns that is immediately followed by a verb.
for text in news_df['news_title'].values:
    # Remove commas, full stops and hyphens.
    text = text.replace(',', '').replace('.', '').replace('-', '')
    # NOTE(review): POSTAG is defined outside this snippet; if it is
    # nltk.pos_tag it expects a list of tokens rather than a raw string --
    # confirm against its definition.
    sentence_tags = POSTAG(text.lower())
    print(text)
    # Walk over the (word, tag) pairs of the sentence.
    for index, part in enumerate(sentence_tags):
        try:
            if 'NN' in part[1] and 'VB' in sentence_tags[index + 1][1]:
                # noun + verb
                print(">", part[0])
                break
            elif ('NN' in part[1]
                  and 'NN' in sentence_tags[index + 1][1]
                  and 'VB' in sentence_tags[index + 2][1]):
                # noun + noun + verb
                print(">", part[0], sentence_tags[index + 1][0])
                break
            elif ('NN' in part[1]
                  and 'NN' in sentence_tags[index + 1][1]
                  and 'NN' in sentence_tags[index + 2][1]
                  and 'VB' in sentence_tags[index + 3][1]):
                # noun + noun + noun + verb
                print(">", part[0], sentence_tags[index + 1][0],
                      sentence_tags[index + 2][0])
                break
        except IndexError:
            # The look-ahead ran past the end of the sentence, so no pattern
            # can start at this position. Only this error is ignored; any
            # other exception is allowed to surface.
            pass
    print()

I changed the script and separated out the state-machine segment. In my opinion, the most serious limitation of this program is that it returns only the first matching pattern in each sentence (this is quick to fix).

import pandas as pd
import nltk
POSTAG = nltk.pos_tag
df = pd.DataFrame({'text':['high school football players charged after video surfaces showing hazing', 'trump accuser pushes new york to pass the adult survivors act plans to sue']})
def _locate_tokens(text, tokens):
    """Return a ``[start, end]`` character span in *text* for every token.

    Each token is searched for from the end of the previous one, so the
    spans stay correct when the tokenizer splits a word into several tokens
    or when words are separated by more than one space.
    """
    spans = []
    cursor = 0
    for token in tokens:
        start = text.find(token, cursor)
        if start == -1:
            # The token does not occur verbatim in the text from here on, so
            # fall back to the next non-blank position.
            while cursor < len(text) and text[cursor].isspace():
                cursor += 1
            start = cursor
        end = start + len(token)
        spans.append([start, end])
        cursor = end
    return spans


def _first_noun_run_before_verb(tagged_tokens, spans):
    """Return the first run of consecutive nouns directly followed by a verb.

    The result is a list of ``(word, tag, span)`` triples. It is empty when
    no noun run in the sentence is immediately followed by a verb. The
    length of the run is not capped.
    """
    run = []
    for (word, tag), span in zip(tagged_tokens, spans):
        if tag.startswith('NN'):
            run.append((word, tag, span))
        elif tag.startswith('VB') and run:
            # A verb closes the current noun run: this is the match.
            return run
        else:
            # Any other tag breaks the run; start over.
            run = []
    # The sentence ended without a verb after the last noun run.
    return []


for text in df['text'].values:
    # Remove commas, full stops and hyphens, then lower-case. The reported
    # offsets therefore refer to this cleaned text, not to the raw input.
    text = text.replace(',', '').replace('.', '').replace('-', '').lower()
    sentence_tags = POSTAG(nltk.word_tokenize(text))
    start_end = _locate_tokens(text, [word for word, _ in sentence_tags])

    # Only the first matching pattern of each sentence is reported.
    match = _first_noun_run_before_verb(sentence_tags, start_end)
    words_to_print = [word for word, _, _ in match]
    spans_and_tags = [str(span) + ' ' + tag for _, tag, span in match]

    print('> ', ' '.join(words_to_print), ' '.join(spans_and_tags))

output:

>  school football players [5, 11] NN [12, 20] NN [21, 28] NNS
>  trump accuser [0, 5] NN [6, 13] NN

Recommended topics

Hot tags