import gradio as gr
import torch
import numpy as np
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import RobertaForSequenceClassification, RobertaTokenizer
from motif_tagging import detect_motifs
# Load custom fine-tuned sentiment model
sentiment_model = AutoModelForSequenceClassification.from_pretrained("SamanthaStorm/tether-sentiment")
sentiment_tokenizer = AutoTokenizer.from_pretrained("SamanthaStorm/tether-sentiment")
# Load abuse pattern model
model_name = "SamanthaStorm/autotrain-jlpi4-mllvp"
model = RobertaForSequenceClassification.from_pretrained(model_name, trust_remote_code=True)
tokenizer = RobertaTokenizer.from_pretrained(model_name, trust_remote_code=True)
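# Labels for the multi-label abuse-pattern classifier; they are paired positionally with the model's output logits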
LABELS = [
    "gaslighting", "mockery", "dismissiveness", "control", "guilt_tripping", "apology_baiting", "blame_shifting", "projection",
    "contradictory_statements", "manipulation", "deflection", "insults", "obscure_formal", "recovery_phase"
]
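# Per-label decision thresholds: a pattern is flagged when its sigmoid score exceeds its threshold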
THRESHOLDS = {
    "gaslighting": 0.25, "mockery": 0.15, "dismissiveness": 0.45, "control": 0.43, "guilt_tripping": 0.15,
    "apology_baiting": 0.2, "blame_shifting": 0.23, "projection": 0.50, "contradictory_statements": 0.25,
    "manipulation": 0.25, "deflection": 0.30, "insults": 0.34, "obscure_formal": 0.25, "recovery_phase": 0.25
}
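# Plain-language explanations appended to the final summary for the top-ranked patterns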
EXPLANATIONS = {
    "gaslighting": "Gaslighting involves making someone question their own reality or perceptions...",
    "blame_shifting": "Blame-shifting is when one person redirects the responsibility...",
    "projection": "Projection involves accusing the victim of behaviors the abuser exhibits.",
    "dismissiveness": "Dismissiveness is belittling or disregarding another person’s feelings.",
    "mockery": "Mockery ridicules someone in a hurtful, humiliating way.",
    "recovery_phase": "Recovery phase dismisses someone's emotional healing process.",
    "insults": "Insults are derogatory remarks aimed at degrading someone.",
    "apology_baiting": "Apology-baiting manipulates victims into apologizing for abuser's behavior.",
    "deflection": "Deflection avoids accountability by redirecting blame.",
    "control": "Control restricts autonomy through manipulation or coercion.",
    "guilt_tripping": "Guilt-tripping uses guilt to manipulate someone’s actions.",
    "manipulation": "Manipulation deceives to influence or control outcomes.",
    "obscure_formal": "Obscure/formal language manipulates through confusion or superiority."
}
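# Relative weights applied to above-threshold scores when averaging; labels not listed default to 1.0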
PATTERN_WEIGHTS = {
    "gaslighting": 1.3, "mockery": 1.2, "control": 1.2, "dismissiveness": 0.8
}
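# Classify a single message as "supportive" or "undermining" with the fine-tuned sentiment model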
def custom_sentiment(text):
    inputs = sentiment_tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = sentiment_model(**inputs)
    probs = torch.nn.functional.softmax(outputs.logits, dim=1)
    label_idx = torch.argmax(probs).item()
    label_map = {0: "supportive", 1: "undermining"}
    return {"label": label_map[label_idx], "score": probs[0][label_idx].item()}
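# Average the weighted scores of above-threshold patterns into a 0-100 abuse level, scaled by an optional multiplier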
def calculate_abuse_level(scores, thresholds, motif_hits=None, flag_multiplier=1.0):
    weighted_scores = [score * PATTERN_WEIGHTS.get(label, 1.0) for label, score in zip(LABELS, scores) if score > thresholds[label]]
    base_score = round(np.mean(weighted_scores) * 100, 2) if weighted_scores else 0.0
    base_score *= flag_multiplier
    return min(base_score, 100.0)
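# Map a numeric abuse level to a human-readable risk band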
def interpret_abuse_level(score):
    if score > 80:
        return "Extreme / High Risk"
    elif score > 60:
        return "Severe / Harmful Pattern Present"
    elif score > 40:
        return "Likely Abuse"
    elif score > 20:
        return "Mild Concern"
    return "Very Low / Likely Safe"
def analyze_single_message(text, thresholds, motif_flags):
    motif_hits, matched_phrases = detect_motifs(text)
    sentiment = custom_sentiment(text)
    adjusted_thresholds = {k: v * 0.8 for k, v in thresholds.items()} if sentiment['label'] == "undermining" else thresholds.copy()
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
    scores = torch.sigmoid(outputs.logits.squeeze(0)).numpy()
    threshold_labels = [label for label, score in zip(LABELS, scores) if score > adjusted_thresholds[label]]
    phrase_labels = [label for label, _ in matched_phrases]
    pattern_labels_used = list(set(threshold_labels + phrase_labels))
    abuse_level = calculate_abuse_level(scores, adjusted_thresholds, motif_hits)
    top_patterns = sorted([(label, score) for label, score in zip(LABELS, scores)], key=lambda x: x[1], reverse=True)[:2]
    return abuse_level, pattern_labels_used, top_patterns
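# Combine up to three messages plus the contextual-flag checkboxes into a single composite summary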
def analyze_composite(msg1, msg2, msg3, flags):
    thresholds = THRESHOLDS
    messages = [msg1, msg2, msg3]
    active_messages = [m for m in messages if m.strip()]
    if not active_messages:
        return "Please enter at least one message."
    results = [analyze_single_message(m, thresholds, flags) for m in active_messages]
    abuse_scores = [r[0] for r in results]
    base_score = sum(abuse_scores) / len(abuse_scores)
    label_sets = [[label for label, _ in r[2]] for r in results]
    label_counts = {label: sum(label in s for s in label_sets) for label in set().union(*label_sets)}
    top_labels = sorted(label_counts.items(), key=lambda x: x[1], reverse=True)[:2]
    top_explanations = [EXPLANATIONS.get(label, "") for label, _ in top_labels]
    # Adjust flag-based weight relative to number of messages
    danger_weight = 5
    flag_boost = (len(flags) * danger_weight) / len(active_messages)
    composite_score = min(base_score + flag_boost, 100)
    composite_score = round(composite_score, 2)
    result = f"These messages show patterns of {', '.join(label for label, _ in top_labels)} and are estimated to be {composite_score}% likely abusive."
    for expl in top_explanations:
        if expl:
            result += f"\n• {expl}"
    return result
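# Gradio UI: three message inputs and a contextual-flags checkbox group, returning a text summary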
textbox_inputs = [
    gr.Textbox(label="Message 1"),
    gr.Textbox(label="Message 2"),
    gr.Textbox(label="Message 3")
]
checkboxes = gr.CheckboxGroup(label="Contextual Flags", choices=[
    "They've threatened harm", "They isolate me", "I’ve changed my behavior out of fear",
    "They monitor/follow me", "I feel unsafe when alone with them"
])
iface = gr.Interface(
    fn=analyze_composite,
    inputs=textbox_inputs + [checkboxes],
    outputs=gr.Textbox(label="Results"),
    title="Abuse Pattern Detector (Multi-Message)",
    allow_flagging="manual"
)
if __name__ == "__main__":
    iface.launch()
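# For a quick smoke test without the UI, something along these lines should work
# (assuming motif_tagging is importable and both models download successfully):
#   print(analyze_composite("You always twist my words.", "", "", []))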