# src/classifier.py
import json
from pathlib import Path
from typing import Dict, List, Union

from transformers import pipeline
# Lazy-loaded pipelines (module-level to reuse)
_zero_shot_clf = None
_sentiment_clf = None
def get_zero_shot_classifier():
    global _zero_shot_clf
    if _zero_shot_clf is None:
        # BART or RoBERTa NLI models are common choices
        _zero_shot_clf = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
    return _zero_shot_clf
def get_sentiment_classifier():
    global _sentiment_clf
    if _sentiment_clf is None:
        # SST-2 fine-tuned model
        _sentiment_clf = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")
    return _sentiment_clf
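
# Alternative caching sketch (illustrative only; nothing below calls it):
# a single memoized factory gives the same lazy, load-once behavior as the
# module-level globals above without mutable state. get_pipeline_cached is
# a name introduced here, not part of the original module.
from functools import lru_cache

@lru_cache(maxsize=None)
def get_pipeline_cached(task: str, model: str):
    """Build a transformers pipeline once per (task, model) and reuse it."""
    return pipeline(task, model=model)
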
# Schema - fixed topic labels requested by the assignment
TOPIC_LABELS = [
    "How-to",
    "Product",
    "Connector",
    "Lineage",
    "API/SDK",
    "SSO",
    "Glossary",
    "Best practices",
    "Sensitive data",
]
# Optional synonyms/prompts to nudge zero-shot. Note: this mapping is not
# wired into classify_topic_zero_shot below; see classify_topic_with_descriptions
# right after it for one way to use it.
LABEL_DESCRIPTIONS = {
    "How-to": "user asking how to perform a task or request a tutorial",
    "Product": "product feature, UI or general product question",
    "Connector": "questions about connectors, crawlers, integrations and failures",
    "Lineage": "questions about lineage, upstream/downstream or lineage exports",
    "API/SDK": "developer questions about APIs, SDKs, endpoints, code examples",
    "SSO": "authentication, SAML, SSO, Okta, login issues",
    "Glossary": "business glossary, terms, bulk import of glossary terms",
    "Best practices": "request for recommended approach, best practices or governance",
    "Sensitive data": "questions about PII, masking, DLP, secrets",
}
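
# A sketch of one way to use LABEL_DESCRIPTIONS (illustrative; classify_ticket
# below does not call this): pass the richer descriptions as the candidate
# labels, then map the winning description back to its short tag.
def classify_topic_with_descriptions(text: str) -> Dict:
    clf = get_zero_shot_classifier()
    desc_to_label = {v: k for k, v in LABEL_DESCRIPTIONS.items()}
    res = clf(text, candidate_labels=list(desc_to_label))
    # Translate the description strings back into the short topic tags.
    return {
        "labels": [desc_to_label[d] for d in res["labels"]],
        "scores": res["scores"],
    }
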
def classify_topic_zero_shot(text: str, labels: List[str] = TOPIC_LABELS, hypothesis_template: str = "This text is about {}.") -> Dict:
    """
    Run the zero-shot classifier and return its raw result:
    {'sequence': ..., 'labels': [...], 'scores': [...]}, sorted by score, descending.
    Thresholding and top-k selection happen later, in classify_ticket.
    """
    clf = get_zero_shot_classifier()
    # The HF zero-shot pipeline accepts a 'hypothesis_template' to improve results.
    return clf(text, candidate_labels=labels, hypothesis_template=hypothesis_template)
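
# Illustrative call (scores are invented; real values depend on the model):
# >>> classify_topic_zero_shot("How do I set up the Snowflake connector?")
# {'sequence': '...', 'labels': ['Connector', 'How-to', ...], 'scores': [0.62, 0.21, ...]}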
def classify_sentiment_hf(text: str) -> str:
    """
    Return a human-friendly sentiment label, mapping the HF output onto the
    schema Frustrated/Curious/Angry/Neutral/Positive.
    The model emits POSITIVE/NEGATIVE with a confidence score.
    """
    clf = get_sentiment_classifier()
    out = clf(text[:1000])  # truncate long text for speed
    # out looks like [{'label': 'NEGATIVE', 'score': 0.999}]
    if not out:
        return "Neutral"
    lab = out[0]["label"].upper()
    score = out[0]["score"]
    # Simple mapping: use confidence strength to split angry vs. frustrated.
    # Note: SST-2 models are often highly confident, so 0.9 is a rough cutoff.
    if lab == "NEGATIVE":
        if score > 0.9:
            return "Angry"
        return "Frustrated"
    elif lab == "POSITIVE":
        if score > 0.9:
            return "Positive"
        return "Curious"
    else:
        return "Neutral"
# Keep the same rule-based priority function (deterministic SLA logic).
# Note: matching is by substring, so e.g. "cant" also matches inside
# "significant" and "down" inside "downstream"; a word-boundary regex
# would be stricter.
PRIORITY_KEYWORDS_P0 = ["urgent", "asap", "blocked", "blocker", "critical", "production", "failed", "failure", "infuriating", "can't", "cant", "down"]
PRIORITY_KEYWORDS_P1 = ["need", "important", "deadline", "next week", "approaching", "required", "soon", "high"]

def classify_priority(text: str, subject: str = "") -> str:
    t = (subject + " " + text).lower()
    for k in PRIORITY_KEYWORDS_P0:
        if k in t:
            return "P0"
    for k in PRIORITY_KEYWORDS_P1:
        if k in t:
            return "P1"
    return "P2"
def classify_ticket(ticket: Dict, top_k: int = 2, label_score_threshold: float = 0.25) -> Dict:
    """
    Full classification of a single ticket:
    - topic_tags: top_k labels from zero-shot (above threshold)
    - sentiment: HF sentiment mapped to the schema
    - priority: rule-based
    """
    text = " ".join([ticket.get("subject", ""), ticket.get("body", "")])
    z = classify_topic_zero_shot(text)
    labels = z.get("labels", [])
    scores = z.get("scores", [])
    # Collect the top_k labels above the threshold; the pipeline returns
    # labels already sorted by score, descending, so breaking early is safe.
    topic_tags = []
    for lbl, score in zip(labels, scores):
        if score >= label_score_threshold:
            topic_tags.append(lbl)
        if len(topic_tags) >= top_k:
            break
    # Fallback: if nothing passes the threshold, take the single top label.
    if not topic_tags and labels:
        topic_tags = [labels[0]]
    sentiment = classify_sentiment_hf(text)
    priority = classify_priority(ticket.get("body", ""), ticket.get("subject", ""))
    return {
        "id": ticket.get("id"),
        "topic_tags": topic_tags,
        "topic_scores": {lbl: float(s) for lbl, s in zip(labels, scores)},
        "sentiment": sentiment,
        "priority": priority,
    }
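
# Illustrative result (values invented, for shape only):
# {
#     "id": 3,
#     "topic_tags": ["Connector", "How-to"],
#     "topic_scores": {"Connector": 0.62, "How-to": 0.21, ...},
#     "sentiment": "Frustrated",
#     "priority": "P1",
# }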
# Batch-classify all tickets and save the results as JSON.
def classify_all_and_save(input_path: Union[str, Path] = "../sample_tickets.json", output_path: Union[str, Path] = "../classified_tickets_phase2.json"):
    # Resolve paths relative to this file so the script works from any CWD.
    p_in = Path(__file__).parent.joinpath(input_path).resolve()
    p_out = Path(__file__).parent.joinpath(output_path).resolve()
    tickets = json.loads(p_in.read_text(encoding="utf-8"))
    results = []
    for t in tickets:
        c = classify_ticket(t)
        results.append({**t, "classification": c})
    p_out.write_text(json.dumps(results, indent=2), encoding="utf-8")
    print(f"Saved {len(results)} classified tickets to {p_out}")
    return p_out
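
# Usage sketch (assumes sample_tickets.json sits one directory above src/,
# per the default paths above):
#     python src/classifier.py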
if __name__ == "__main__":
    classify_all_and_save()