# src/classifier.py
import json
from pathlib import Path
from typing import Dict, List, Union

from transformers import pipeline
# Lazy-loaded pipelines (module-level to reuse)
_zero_shot_clf = None
_sentiment_clf = None
def get_zero_shot_classifier():
    global _zero_shot_clf
    if _zero_shot_clf is None:
        # BART or RoBERTa NLI models are common choices
        _zero_shot_clf = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
    return _zero_shot_clf
def get_sentiment_classifier():
    global _sentiment_clf
    if _sentiment_clf is None:
        # SST-2 fine-tuned model
        _sentiment_clf = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")
    return _sentiment_clf
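
# Alternative caching sketch (illustrative only; nothing below calls it):
# a single memoized factory gives the same lazy, load-once behavior as the
# module-level globals above without mutable state. get_pipeline_cached is
# a name introduced here, not part of the original module.
from functools import lru_cache

@lru_cache(maxsize=None)
def get_pipeline_cached(task: str, model: str):
    """Build a transformers pipeline once per (task, model) and reuse it."""
    return pipeline(task, model=model)
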
# Schema - fixed topic labels requested by the assignment
TOPIC_LABELS = [
    "How-to",
    "Product",
    "Connector",
    "Lineage",
    "API/SDK",
    "SSO",
    "Glossary",
    "Best practices",
    "Sensitive data",
]
# Optional synonyms/prompts to nudge zero-shot. Note: this mapping is not
# wired into classify_topic_zero_shot below; see classify_topic_with_descriptions
# right after it for one way to use it.
LABEL_DESCRIPTIONS = {
    "How-to": "user asking how to perform a task or request a tutorial",
    "Product": "product feature, UI or general product question",
    "Connector": "questions about connectors, crawlers, integrations and failures",
    "Lineage": "questions about lineage, upstream/downstream or lineage exports",
    "API/SDK": "developer questions about APIs, SDKs, endpoints, code examples",
    "SSO": "authentication, SAML, SSO, Okta, login issues",
    "Glossary": "business glossary, terms, bulk import of glossary terms",
    "Best practices": "request for recommended approach, best practices or governance",
    "Sensitive data": "questions about PII, masking, DLP, secrets",
}
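
# A sketch of one way to use LABEL_DESCRIPTIONS (illustrative; classify_ticket
# below does not call this): pass the richer descriptions as the candidate
# labels, then map the winning description back to its short tag.
def classify_topic_with_descriptions(text: str) -> Dict:
    clf = get_zero_shot_classifier()
    desc_to_label = {v: k for k, v in LABEL_DESCRIPTIONS.items()}
    res = clf(text, candidate_labels=list(desc_to_label))
    # Translate the description strings back into the short topic tags.
    return {
        "labels": [desc_to_label[d] for d in res["labels"]],
        "scores": res["scores"],
    }
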
def classify_topic_zero_shot(text: str, labels: List[str] = TOPIC_LABELS, hypothesis_template: str = "This text is about {}.") -> Dict:
    """
    Run the zero-shot classifier and return its raw result:
    {'sequence': ..., 'labels': [...], 'scores': [...]}, sorted by score, descending.
    Thresholding and top-k selection happen later, in classify_ticket.
    """
    clf = get_zero_shot_classifier()
    # The HF zero-shot pipeline accepts a 'hypothesis_template' to improve results.
    return clf(text, candidate_labels=labels, hypothesis_template=hypothesis_template)
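
# Illustrative call (scores are invented; real values depend on the model):
# >>> classify_topic_zero_shot("How do I set up the Snowflake connector?")
# {'sequence': '...', 'labels': ['Connector', 'How-to', ...], 'scores': [0.62, 0.21, ...]}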
def classify_sentiment_hf(text: str) -> str:
    """
    Return a human-friendly sentiment label, mapping the HF output onto the
    schema Frustrated/Curious/Angry/Neutral/Positive.
    The model emits POSITIVE/NEGATIVE with a confidence score.
    """
    clf = get_sentiment_classifier()
    out = clf(text[:1000])  # truncate long text for speed
    # out looks like [{'label': 'NEGATIVE', 'score': 0.999}]
    if not out:
        return "Neutral"
    lab = out[0]["label"].upper()
    score = out[0]["score"]
    # Simple mapping: use confidence strength to split angry vs. frustrated.
    # Note: SST-2 models are often highly confident, so 0.9 is a rough cutoff.
    if lab == "NEGATIVE":
        if score > 0.9:
            return "Angry"
        return "Frustrated"
    elif lab == "POSITIVE":
        if score > 0.9:
            return "Positive"
        return "Curious"
    else:
        return "Neutral"
# Keep the same rule-based priority function (deterministic SLA logic).
# Note: matching is by substring, so e.g. "cant" also matches inside
# "significant" and "down" inside "downstream"; a word-boundary regex
# would be stricter.
PRIORITY_KEYWORDS_P0 = ["urgent", "asap", "blocked", "blocker", "critical", "production", "failed", "failure", "infuriating", "can't", "cant", "down"]
PRIORITY_KEYWORDS_P1 = ["need", "important", "deadline", "next week", "approaching", "required", "soon", "high"]

def classify_priority(text: str, subject: str = "") -> str:
    t = (subject + " " + text).lower()
    for k in PRIORITY_KEYWORDS_P0:
        if k in t:
            return "P0"
    for k in PRIORITY_KEYWORDS_P1:
        if k in t:
            return "P1"
    return "P2"
def classify_ticket(ticket: Dict, top_k: int = 2, label_score_threshold: float = 0.25) -> Dict:
    """
    Full classification of a single ticket:
    - topic_tags: top_k labels from zero-shot (above threshold)
    - sentiment: HF sentiment mapped to the schema
    - priority: rule-based
    """
    text = " ".join([ticket.get("subject", ""), ticket.get("body", "")])
    z = classify_topic_zero_shot(text)
    labels = z.get("labels", [])
    scores = z.get("scores", [])
    # Collect the top_k labels above the threshold; the pipeline returns
    # labels already sorted by score, descending, so breaking early is safe.
    topic_tags = []
    for lbl, score in zip(labels, scores):
        if score >= label_score_threshold:
            topic_tags.append(lbl)
        if len(topic_tags) >= top_k:
            break
    # Fallback: if nothing passes the threshold, take the single top label.
    if not topic_tags and labels:
        topic_tags = [labels[0]]
    sentiment = classify_sentiment_hf(text)
    priority = classify_priority(ticket.get("body", ""), ticket.get("subject", ""))
    return {
        "id": ticket.get("id"),
        "topic_tags": topic_tags,
        "topic_scores": {lbl: float(s) for lbl, s in zip(labels, scores)},
        "sentiment": sentiment,
        "priority": priority,
    }
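
# Illustrative result (values invented, for shape only):
# {
#     "id": 3,
#     "topic_tags": ["Connector", "How-to"],
#     "topic_scores": {"Connector": 0.62, "How-to": 0.21, ...},
#     "sentiment": "Frustrated",
#     "priority": "P1",
# }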
# Batch-classify all tickets and save the results as JSON.
def classify_all_and_save(input_path: Union[str, Path] = "../sample_tickets.json", output_path: Union[str, Path] = "../classified_tickets_phase2.json"):
    # Resolve paths relative to this file so the script works from any CWD.
    p_in = Path(__file__).parent.joinpath(input_path).resolve()
    p_out = Path(__file__).parent.joinpath(output_path).resolve()
    tickets = json.loads(p_in.read_text(encoding="utf-8"))
    results = []
    for t in tickets:
        c = classify_ticket(t)
        results.append({**t, "classification": c})
    p_out.write_text(json.dumps(results, indent=2), encoding="utf-8")
    print(f"Saved {len(results)} classified tickets to {p_out}")
    return p_out
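
# Usage sketch (assumes sample_tickets.json sits one directory above src/,
# per the default paths above):
#     python src/classifier.py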
if __name__ == "__main__":
    classify_all_and_save()