keyword / README.md
mclear1's picture
Update README.md
b316eef verified

import re

import nltk
from nltk.tokenize import word_tokenize

# Load the document

# Read the whole document into memory as a single string. The encoding is
# given explicitly so decoding does not depend on the platform's locale
# default.
with open('document.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# Preprocess the text

# Lower-case the text and split it into tokens with NLTK's word_tokenize.
tokens = word_tokenize(text.lower())
# Keep only tokens made up entirely of alphabetic characters; this drops
# whole tokens (punctuation, numbers), not individual characters.
tokens = [t for t in tokens if t.isalpha()]

# Define the key words

# Terms and phrases to look for in the document, one entry per line.
key_words = [
    'chronic kidney disease',
    'heart failure',
    'cirrhosis',
    'ascites',
    'ESRD',
    'liver disease',
]

# Use regular expressions to find the key words in the text

# Collect every key word that occurs in the document text as a whole word or
# phrase, preserving the order of `key_words`.
#   - re.escape: key words are matched literally, even if one ever contains
#     a regex metacharacter such as '.', '(' or '+'.
#   - re.IGNORECASE: capitalisation in the document ("Heart failure", "esrd")
#     does not cause a miss.
#   - \b on both sides: the term must not be embedded inside a longer word.
found_key_words = [
    key_word
    for key_word in key_words
    if re.search(r'\b' + re.escape(key_word) + r'\b', text, re.IGNORECASE)
]

# Print the list of key words that were found

# Write the list of matched key words to standard output.
print(found_key_words)