ToS-Summarization / keyphrase_extraction.py
EE21's picture
Update keyphrase_extraction.py
c68f9df verified
from rake_nltk import Rake
import nltk
import re
# Fetch the NLTK resources at import time (presumably the tokenizer models
# and stop-word lists that rake_nltk needs -- confirm against its docs).
for _nltk_resource in ('punkt', 'stopwords'):
    nltk.download(_nltk_resource)
# Words and phrases treated as markers of an obligation.
# English entries come first, followed by the German ones.
obligation_words = [
    # --- English ---
    "must",
    "will",
    "use",
    "may",
    "provides",
    "is obliged to",
    "has to",
    "needs to",
    "is required to",
    "shall",
    "should",
    "ought to",
    "required",
    "obligated",
    "duty",
    "agrees to",
    "has a duty to",
    "is expected to",
    "commits to",
    # --- German ---
    "muss",
    "wird",
    "nutzen",
    "darf",
    "stellt bereit",
    "ist verpflichtet",
    "ist erforderlich",
    "soll",
    "sollte",
    "erforderlich",
    "verpflichtet",
    "Pflicht",
    "stimmt zu",
    "hat die Pflicht",
    "wird erwartet",
    "verpflichtet sich",
]
def extract_sentences_with_obligations(text):
    """Return the sentences of *text* whose keyphrases mention an obligation.

    The text is split into sentences at whitespace that follows ``.``, ``!``
    or ``?``. Each sentence is passed through RAKE, and the sentence is kept
    when any ranked keyphrase contains one of ``obligation_words`` as a
    substring. The comparison is case-insensitive on both sides.

    Args:
        text: The document to scan, as a single string.

    Returns:
        The matching sentences in their original order, joined by single
        spaces. An empty string when no sentence matches.
    """
    # Rake() is constructed with its defaults; per the rake_nltk docs that
    # means the default stop-word list is applied, not disabled.
    # NOTE(review): obligation terms that are themselves stop words (e.g.
    # "will", "should") may therefore never appear in a keyphrase -- confirm
    # whether matching against the raw sentence was intended instead.
    rake = Rake()

    # Keyphrases are compared in lower case, so the obligation terms must be
    # lower-cased too; otherwise capitalised entries such as "Pflicht" can
    # never match. Done once here rather than per sentence.
    needles = tuple(term.lower() for term in obligation_words)

    obligation_sentences = []
    for sentence in re.split(r'(?<=[.!?])\s+', text):
        rake.extract_keywords_from_text(sentence)
        phrases = [phrase.lower() for phrase in rake.get_ranked_phrases()]
        # Substring test: a term matches anywhere inside a keyphrase, so
        # short terms like "use" also hit longer words such as "user".
        if any(needle in phrase for phrase in phrases for needle in needles):
            obligation_sentences.append(sentence)

    return ' '.join(obligation_sentences)