Synopsis-Scorer / utils.py
ujwal55's picture
Upload 5 files
1b330b0 verified
import re
import fitz
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import spacy
# Load both models once, at import time, so every call below reuses them.
# spaCy English pipeline; anonymize_text reads its named entities (doc.ents).
nlp = spacy.load("en_core_web_sm")
# Sentence-embedding model; score_synopsis encodes article and synopsis with it.
model = SentenceTransformer('all-MiniLM-L6-v2')
def extract_text(file):
    """Return the text content of an uploaded PDF or UTF-8 text file.

    Args:
        file: A binary file-like object exposing ``name`` and ``read()``.

    Returns:
        For names ending in ``.pdf`` (any letter case), the text of every
        page joined with newlines; otherwise the file's bytes decoded as
        UTF-8.

    Raises:
        UnicodeDecodeError: If a non-PDF file is not valid UTF-8.
    """
    # Compare case-insensitively so "REPORT.PDF" is not mistaken for text.
    if file.name.lower().endswith(".pdf"):
        # The context manager closes the document once the pages are read,
        # instead of leaving it open until garbage collection.
        with fitz.open(stream=file.read(), filetype="pdf") as doc:
            return "\n".join(page.get_text() for page in doc)
    return file.read().decode("utf-8")
def _merge_overlapping_spans(spans):
    """Collapse overlapping (start, end, label) spans into disjoint ones.

    The result is sorted by start offset. A merged span covers the union of
    the spans that overlapped and carries the label of the widest of them;
    on a width tie the span that came first in *spans* wins.
    """
    # Each entry: [start, end, label, width of the span that supplied label].
    merged = []
    # sorted() is stable, so equal (start, end) pairs keep their input order.
    for start, end, label in sorted(spans, key=lambda s: (s[0], -s[1])):
        if end <= start:
            continue  # zero-width span: nothing to replace
        if merged and start < merged[-1][1]:
            current = merged[-1]
            current[1] = max(current[1], end)
            if end - start > current[3]:
                current[2] = label
                current[3] = end - start
        else:
            merged.append([start, end, label, end - start])
    return [(start, end, label) for start, end, label, _ in merged]


def anonymize_text(text):
    """Replace detected personal details in *text* with bracketed placeholders.

    spaCy entities labelled PERSON, DATE, GPE/LOC and ORG become ``[PERSON]``,
    ``[DATE]``, ``[LOCATION]`` and ``[ORG]``. Regular expressions additionally
    catch e-mail addresses (``[EMAIL]``), URLs (``[URL]``), 10-digit numbers
    (``[PHONE]``) and IDs made of 2+ capitals followed by 6+ digits (``[ID]``).

    Args:
        text: The string to anonymize.

    Returns:
        A copy of *text* with every detected span replaced by its placeholder.
    """
    entity_labels = {
        "PERSON": "PERSON",
        "DATE": "DATE",
        "GPE": "LOCATION",
        "LOC": "LOCATION",
        "ORG": "ORG",
    }
    regex_patterns = [
        (r"\b[\w\.-]+@[\w\.-]+\.\w+\b", "EMAIL"),  # Email
        (r"https?://\S+|www\.\S+", "URL"),  # URLs
        (r"\b\d{10}\b", "PHONE"),  # 10-digit phone numbers
        (r"\b[A-Z]{2,}\d{6,}\b", "ID"),  # Generic IDs (e.g., AA123456)
    ]

    # Regex matches are listed first so that, when a regex and an entity cover
    # exactly the same characters, the regex label is the one kept.
    spans = [
        (match.start(), match.end(), label)
        for pattern, label in regex_patterns
        for match in re.finditer(pattern, text)
    ]
    spans.extend(
        (ent.start_char, ent.end_char, entity_labels[ent.label_])
        for ent in nlp(text).ents
        if ent.label_ in entity_labels
    )

    # All offsets refer to the original text. Substituting overlapping spans
    # one after another would shift later offsets and corrupt the output, so
    # make the spans disjoint and rebuild the string in a single pass.
    pieces = []
    cursor = 0
    for start, end, label in _merge_overlapping_spans(spans):
        pieces.append(text[cursor:start])
        pieces.append(f"[{label}]")  # Brackets mark the placeholder clearly
        cursor = end
    pieces.append(text[cursor:])
    return "".join(pieces)
def score_synopsis(article, synopsis):
    """Score how well *synopsis* summarises *article* on a 0-100 scale.

    The total is the sum of three parts:

    * ``content_coverage`` (0-50): cosine similarity between the embeddings
      of the article and the synopsis, clamped to [0, 1] and scaled by 50.
    * ``clarity`` (0-25): ratio of distinct whitespace-separated tokens to
      all tokens in the synopsis, scaled by 25.
    * ``coherence`` (0-25): 5 points per ``"."`` in the synopsis, capped at 25.

    Args:
        article: Full article text.
        synopsis: Synopsis text to evaluate.

    Returns:
        A dict with keys ``total``, ``content_coverage``, ``clarity`` and
        ``coherence``, each rounded to two decimals.
    """
    embeddings = model.encode([article, synopsis])
    raw_similarity = cosine_similarity([embeddings[0]], [embeddings[1]])[0][0]
    # Cosine similarity can be negative, or exceed 1 by rounding error; clamp
    # it so the sub-score stays in range. float() turns the NumPy scalar into
    # a builtin number, keeping the returned dict JSON-serialisable.
    similarity = max(0.0, min(1.0, float(raw_similarity)))
    content_coverage = similarity * 50

    words = synopsis.split()
    # max(..., 1) avoids dividing by zero for an empty synopsis.
    clarity = (len(set(words)) / max(len(words), 1)) * 25

    # split(".") yields one more piece than there are periods.
    coherence = min(25, 5 * (len(synopsis.split(".")) - 1))

    total = content_coverage + clarity + coherence
    return {
        "total": round(total, 2),
        "content_coverage": round(content_coverage, 2),
        "clarity": round(clarity, 2),
        "coherence": round(coherence, 2),
    }