import os
import re

import spacy
import numpy as np
import language_tool_python
from sentence_transformers import SentenceTransformer, util, CrossEncoder
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from textstat import textstat

# language_tool_python needs a Java runtime; point JAVA_HOME at it when present.
if os.path.exists("/usr/lib/jvm/java-17-openjdk-amd64"):
    os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-17-openjdk-amd64"
print("Loading models... this may take a moment.") |
|
|
|
|
|
|
|

# Load the small English spaCy pipeline, downloading it on first run if missing.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    print("Spacy model not found. Downloading...")
    spacy.cli.download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")

sbert_model = SentenceTransformer('all-MiniLM-L6-v2')
grammar_tool = language_tool_python.LanguageTool('en-US')
sentiment_analyzer = SentimentIntensityAnalyzer()

# Note: 'cross-encoder/stsb-distilroberta-base' is a semantic-similarity (STS)
# cross-encoder, not a true NLI model, despite the variable name.
nli_model = CrossEncoder('cross-encoder/stsb-distilroberta-base')

print("Models loaded successfully.")

RUBRIC = {
    "salutation": {
        "normal": ["hi", "hello"],
        "good": ["good morning", "good afternoon", "good evening", "good day", "hello everyone"],
        "excellent": ["excited to introduce", "feeling great", "pleasure to introduce", "greetings"]
    },
    "content": {
        "must_have": {
            "points": 4,
            "topics": ["Name", "Age", "School/Class", "Family", "Hobbies/Interests"]
        },
        "good_to_have": {
            "points": 2,
            "topics": ["Origin/Location", "Ambition/Goal", "Fun Fact/Unique", "Strengths", "Achievements"]
        }
    },
    "speech_rate": {
        "fast_threshold": 160,
        "ideal_min": 111,
        "ideal_max": 140,
        "slow_threshold": 80
    },
    "fillers": ["um", "uh", "like", "you know", "actually", "basically", "right", "i mean", "well", "kinda", "sort of", "hmm"]
}


class IntroductionScorer:
    def __init__(self, transcript_text, audio_duration_sec=None):
        self.text = transcript_text
        self.doc = nlp(transcript_text)
        self.provided_duration = float(audio_duration_sec) if audio_duration_sec else 0

        self.duration_min = (self.provided_duration / 60) if self.provided_duration else 0
        self.sentences = [sent.text.strip() for sent in self.doc.sents]
        self.words = [token.text.lower() for token in self.doc if not token.is_punct]
        self.total_words = len(self.words)

    def score_salutation(self):
        text_lower = self.text.lower()

        for phrase in RUBRIC["salutation"]["excellent"]:
            if phrase in text_lower:
                return 5, f"Excellent salutation used: '{phrase}'"

        for phrase in RUBRIC["salutation"]["good"]:
            if phrase in text_lower:
                return 4, f"Good salutation used: '{phrase}'"

        for word in RUBRIC["salutation"]["normal"]:
            if word in text_lower:
                return 2, "Basic salutation used (Hi/Hello). Try to be more formal."

        return 0, "No salutation found."

    def score_content(self):
        scores = 0
        feedback = []

        # Regex checks for the must-have topics. Note: re.IGNORECASE also relaxes
        # the [A-Z] "name initial" class, so any letter after the phrase counts.
        regex_name = r"\b(name\s+is|i\s+am|i[\s'’]*m|myself|this\s+is)\s+([A-Z])"
        regex_age = r"\b(\d+|thirteen|fourteen|fifteen|sixteen)\s*(-)?\s*(years|yrs)\b"
        regex_school = r"\b(class|grade|standard|school|college|university|study|student)\b"

        if re.search(regex_name, self.text, re.IGNORECASE):
            scores += 4
            feedback.append("[+] Name")
        else:
            feedback.append("[-] Name")

        if re.search(regex_age, self.text, re.IGNORECASE):
            scores += 4
            feedback.append("[+] Age")
        else:
            feedback.append("[-] Age")

        if re.search(regex_school, self.text, re.IGNORECASE):
            scores += 4
            feedback.append("[+] School")
        else:
            feedback.append("[-] School")

        def check_topic_robust(regex, anchors, use_ai=True):
            # Cheap path first: a regex hit settles it.
            if re.search(regex, self.text, re.IGNORECASE):
                return True

            # Fallback: embed the transcript sentences and the anchor phrases, and
            # accept the topic if any sentence is similar enough to any anchor.
            if use_ai and self.sentences:
                topic_emb = sbert_model.encode(anchors, convert_to_tensor=True)
                text_emb = sbert_model.encode(self.sentences, convert_to_tensor=True)

                best_score = float(util.cos_sim(text_emb, topic_emb).max())
                return best_score > 0.35
            return False

        if check_topic_robust(r"\b(family|parents|mother|father|siblings)\b", ["My family", "I live with"]):
            scores += 4
            feedback.append("[+] Family")
        else:
            feedback.append("[-] Family")

        if check_topic_robust(r"\b(hobby|hobbies|enjoy|like\s+(to|playing|reading)|pastime)\b", ["My hobby is", "I enjoy"]):
            scores += 4
            feedback.append("[+] Hobbies")
        else:
            feedback.append("[-] Hobbies")

        # Good-to-have topics: each entry is (regex, embedding anchors, use_ai flag).
        bonuses = {
            "Ambition": (r"\b(goal|ambition|dream|want\s+to\s+be)\b", ["I want to become"], True),
            "Strength": (r"\b(strength|good\s+at|confident)\b", ["My strength is"], True),
            "Unique": (r"\b(unique|special|fun\s+fact)\b", ["fun fact"], True),
            "Origin": (r"\b(i\s+am\s+from|i['’]m\s+from|originally\s+from|live\s+in|living\s+in|born\s+in|hometown|native)\b", [], False),
            "Achievements": (r"\b(won|achievement|award)\b", ["I won"], True)
        }

        for topic, (reg, anc, use_ai_flag) in bonuses.items():
            if check_topic_robust(reg, anc, use_ai=use_ai_flag):
                scores += 2
                feedback.append(f"[+] {topic}")

        return min(30, scores), ", ".join(feedback)

    def score_flow(self):
        anchors = {
            "salutation": ["Hello everyone", "Good morning", "Hi", "Greetings"],
            "intro": ["My name is", "I am", "I'm", "I’m", "Myself", "This is"],
            "closing": ["Thank you", "Thanks", "That is all", "The end"],
            "body": ["family", "mother", "school", "class", "hobby", "playing", "dream", "goal"]
        }

        if not self.sentences:
            return 0, "No text"

        text_emb = sbert_model.encode(self.sentences, convert_to_tensor=True)

        def get_idx(key, thresh=0.25):
            # Index of the sentence most similar to the anchor set, plus whether
            # that similarity clears the detection threshold.
            anc = sbert_model.encode(anchors[key], convert_to_tensor=True)
            sims = util.cos_sim(text_emb, anc).max(dim=1).values
            best_idx = int(sims.argmax())
            best_score = float(sims.max())
            return best_idx, best_score > thresh

        idx_s, has_s = get_idx("salutation", 0.25)
        idx_i, has_i = get_idx("intro", 0.25)
        idx_c, has_c = get_idx("closing", 0.30)

        # Check whether anything between the intro and the closing looks like body content.
        has_body = False
        if has_i and has_c and idx_c > idx_i:
            if idx_c - idx_i >= 1:
                mid_sents = self.sentences[idx_i + 1 : idx_c]
                if mid_sents:
                    mid_emb = sbert_model.encode(mid_sents, convert_to_tensor=True)
                    bod_emb = sbert_model.encode(anchors["body"], convert_to_tensor=True)
                    if util.cos_sim(mid_emb, bod_emb).max() > 0.25:
                        has_body = True

        debug_info = f"(Indices: Sal={idx_s if has_s else 'X'}, Intro={idx_i if has_i else 'X'}, End={idx_c if has_c else 'X'})"

        if has_s and has_c:
            if has_i:
                if idx_s <= idx_i < idx_c:
                    return (5, "Perfect Flow") if has_body else (5, "Good Flow (Short body)")
                if idx_i == idx_c:
                    return 0, f"Disordered: Introduction and Closing are detected in the same sentence. {debug_info}"
            elif idx_s < idx_c:
                return (5, "Good Flow") if has_body else (5, "Acceptable Flow")

        return 0, f"Flow disordered. {debug_info}"

    def score_speech_rate(self):
        if not self.provided_duration:
            return 10, "Duration not provided (Assumed Ideal)"

        wpm = self.total_words / self.duration_min if self.duration_min > 0 else 0

        # Thresholds mirror RUBRIC["speech_rate"].
        if 111 <= wpm <= 140:
            return 10, f"Ideal ({int(wpm)} WPM)"
        if 81 <= wpm <= 160:
            return 6, f"Acceptable ({int(wpm)} WPM)"
        if wpm > 140:
            return 2, f"Too Fast ({int(wpm)} WPM)"
        if wpm < 81:
            return 2, f"Too Slow ({int(wpm)} WPM)"

        return 2, f"Poor Pacing ({int(wpm)} WPM)"

    def score_grammar(self):
        try:
            matches = grammar_tool.check(self.text)
            scoring_errors = []
            ignored_issues = []

            # Separate true grammar errors from issues that are deliberately not scored
            # (spelling, hyphenation, punctuation, style, etc.).
            for m in matches:
                rid = getattr(m, 'ruleId', '').upper()
                msg = getattr(m, 'message', '').lower()
                replacements = getattr(m, 'replacements', [])

                offset = getattr(m, 'offset', 0)
                length = getattr(m, 'errorLength', getattr(m, 'length', 5))
                error_text = self.text[offset : offset + length]

                is_ignored = False

                # Ignore suggestions that merely hyphenate the flagged words.
                if replacements:
                    top_rep = replacements[0]
                    if "-" in top_rep and top_rep.replace("-", "") == error_text.replace(" ", ""):
                        is_ignored = True

                ignore_keywords = [
                    "hyphen", "compound", "joined", "whitespace", "comma", "punctuation",
                    "spelling", "typo", "morfologik", "uppercase", "capitalization",
                    "repetition", "consecutive", "successive", "same word",
                    "style", "wordiness", "sentence start", "rewording", "thesaurus"
                ]

                if any(k in msg or k in rid.lower() for k in ignore_keywords):
                    is_ignored = True

                if is_ignored:
                    ignored_issues.append(m)
                else:
                    scoring_errors.append(m)

            err_count = len(scoring_errors)
            errors_per_100 = (err_count / self.total_words) * 100 if self.total_words > 0 else 0

            # Map error density to a 0-1 quality metric (5+ errors per 100 words -> 0).
            grammar_metric = 1 - min(errors_per_100 / 5, 1)

            if grammar_metric > 0.9: s = 10; g = "Flawless"
            elif grammar_metric >= 0.7: s = 8; g = "Good"
            elif grammar_metric >= 0.5: s = 6; g = "Average"
            elif grammar_metric >= 0.3: s = 4; g = "Needs Improvement"
            else: s = 2; g = "Poor"

            fb_lines = []
            fb_lines.append(f"{g} (Score: {s}/10)")
            fb_lines.append("NOTE: Spelling, hyphens, punctuation, and style ignored.")

            if scoring_errors:
                fb_lines.append(f"\n[CRITICAL GRAMMAR ERRORS] ({len(scoring_errors)} found):")
                for m in scoring_errors[:3]:
                    off = getattr(m, 'offset', 0)
                    ln = getattr(m, 'errorLength', getattr(m, 'length', 5))
                    ctx = self.text[off : off + ln + 10].replace('\n', ' ')
                    fb_lines.append(f"  - {m.message} (Context: '...{ctx}...')")
            else:
                fb_lines.append("\n[CRITICAL GRAMMAR ERRORS]: None.")

            if ignored_issues:
                fb_lines.append(f"\n[IGNORED ISSUES] ({len(ignored_issues)} found):")
                for m in ignored_issues[:3]:
                    msg = getattr(m, 'message', 'Issue')
                    off = getattr(m, 'offset', 0)
                    ln = getattr(m, 'errorLength', getattr(m, 'length', 5))
                    ctx = self.text[off : off + ln + 10].replace('\n', ' ')
                    fb_lines.append(f"  - {msg} (Context: '...{ctx}...')")

            return s, "\n".join(fb_lines)

        except Exception as e:
            return 5, f"Error during grammar check: {str(e)}"

    def score_vocabulary(self):
        # Type-token ratio: distinct words over total words.
        distinct_words = len(set(self.words))
        ttr = distinct_words / self.total_words if self.total_words > 0 else 0

        if ttr >= 0.9:
            return 10, f"Excellent variety (TTR: {ttr:.2f})"
        elif ttr >= 0.7:
            return 8, f"Good variety (TTR: {ttr:.2f})"
        elif ttr >= 0.5:
            return 6, f"Average variety (TTR: {ttr:.2f})"
        elif ttr >= 0.3:
            return 4, f"Repetitive (TTR: {ttr:.2f})"
        else:
            return 2, f"Very repetitive (TTR: {ttr:.2f})"

    def score_clarity(self):
        text_lower = self.text.lower()
        filler_count = 0
        for filler in RUBRIC["fillers"]:
            if " " in filler:
                # Multi-word fillers ("you know", "i mean", ...) span tokens, so count
                # them on the raw text rather than the single-token word list.
                filler_count += len(re.findall(r"\b" + re.escape(filler) + r"\b", text_lower))
            else:
                filler_count += sum(1 for word in self.words if word == filler)

        filler_rate = (filler_count / self.total_words) * 100 if self.total_words > 0 else 0

        if filler_rate <= 3:
            return 15, f"Clear speech ({filler_count} fillers)"
        elif filler_rate <= 6:
            return 12, f"Mostly clear ({filler_count} fillers)"
        elif filler_rate <= 9:
            return 9, f"Some hesitation ({filler_count} fillers)"
        elif filler_rate <= 12:
            return 6, f"Hesitant ({filler_count} fillers)"
        else:
            return 3, f"Distracted by fillers ({filler_count} fillers)"

    def score_engagement(self):
        vs = sentiment_analyzer.polarity_scores(self.text)

        # Rescale VADER's compound score from [-1, 1] to [0, 1].
        prob = (vs['compound'] + 1) / 2

        high_energy_kws = [
            "excited", "thrilled", "passionate", "delighted", "honor",
            "love", "amazing", "wonderful", "fantastic", "energetic",
            "grateful", "confident", "pleasure"
        ]

        has_enthusiasm = any(w in self.text.lower() for w in high_energy_kws)

        # Cap the score just below "Very Engaging" unless explicit enthusiasm words appear.
        if prob >= 0.9 and not has_enthusiasm:
            prob = 0.88

        if prob >= 0.9:
            return 15, f"Very Engaging (Sentiment: {prob:.2f})"
        elif prob >= 0.7:
            return 12, f"Positive (Sentiment: {prob:.2f})"
        elif prob >= 0.5:
            return 9, f"Neutral (Sentiment: {prob:.2f})"
        elif prob >= 0.3:
            return 6, f"Slightly Negative (Sentiment: {prob:.2f})"
        else:
            return 3, f"Negative (Sentiment: {prob:.2f})"

    def calculate_overall_score(self):
        s_salutation, f_salutation = self.score_salutation()
        s_content, f_content = self.score_content()
        s_flow, f_flow = self.score_flow()
        s_rate, f_rate = self.score_speech_rate()
        s_grammar, f_grammar = self.score_grammar()
        s_vocab, f_vocab = self.score_vocabulary()
        s_clarity, f_clarity = self.score_clarity()
        s_engage, f_engage = self.score_engagement()

        # Component maxima sum to 100: 5 + 30 + 5 + 10 + 10 + 10 + 15 + 15.
        total_score = (
            s_salutation + s_content + s_flow + s_rate +
            s_grammar + s_vocab + s_clarity + s_engage
        )

        return {
            "Total Score": total_score,
            "Breakdown": {
                "Salutation": {"score": s_salutation, "max": 5, "feedback": f_salutation},
                "Content & Structure": {"score": s_content, "max": 30, "feedback": f_content},
                "Flow": {"score": s_flow, "max": 5, "feedback": f_flow},
                "Speech Rate": {"score": s_rate, "max": 10, "feedback": f_rate},
                "Grammar": {"score": s_grammar, "max": 10, "feedback": f_grammar},
                "Vocabulary": {"score": s_vocab, "max": 10, "feedback": f_vocab},
                "Clarity (Fillers)": {"score": s_clarity, "max": 15, "feedback": f_clarity},
                "Engagement": {"score": s_engage, "max": 15, "feedback": f_engage},
            }
        }
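

# A minimal usage sketch (an addition, not part of the original script). The sample
# transcript and the 30-second duration below are illustrative assumptions only;
# in practice, pass a real ASR transcript and its audio length in seconds.
if __name__ == "__main__":
    sample_transcript = (
        "Good morning everyone. My name is Asha and I am thirteen years old. "
        "I study in class eight. I live with my parents and my younger brother. "
        "My hobby is playing chess, and my dream is to become a doctor. Thank you."
    )
    scorer = IntroductionScorer(sample_transcript, audio_duration_sec=30)
    result = scorer.calculate_overall_score()

    print(f"Total Score: {result['Total Score']}/100")
    for category, detail in result["Breakdown"].items():
        print(f"- {category}: {detail['score']}/{detail['max']} -> {detail['feedback']}")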