Spaces:

pavankumarvk
/

Multi_Modal_Deepfake_Detection

Sleeping

App Files Files Community

pavankumarvk committed 14 days ago

Commit

28c4d49

verified ·

1 Parent(s): 90121fd

Update text_detector_inference.py

Browse files

Files changed (1) hide show

text_detector_inference.py +117 -37

text_detector_inference.py CHANGED Viewed

@@ -2,68 +2,96 @@
 text_detector_inference.py
 ==========================
 Inference wrapper for HybridAITextDetector.
-Designed to be imported by app.py (Gradio) in the Hugging Face Space.
 Usage
 -----
 from text_detector_inference import TextDetectorInference
-detector = TextDetectorInference(
-    checkpoint="best_text_detector.pt",
-    threshold=0.5
-)
-result = detector.predict("Some text here...")
 """
 import os
 import torch
-from transformers import AutoTokenizer
 from text_detector_model import HybridAITextDetector, MODEL_NAME, MAX_LENGTH
 class TextDetectorInference:
     """
-    Thin wrapper around HybridAITextDetector for single-text prediction.
     Parameters
     ----------
     checkpoint : str
-        Path to the .pt state-dict file.
     threshold  : float
         Decision boundary for the sigmoid probability (default 0.5).
-        Set to the optimal F1 threshold found during evaluation.
-    device     : torch.device or None
         Auto-detects CUDA if None.
     """
     def __init__(
         self,
-        checkpoint: str  = "best_text_detector.pt",
         threshold:  float = 0.5,
         device: torch.device = None,
     ):
-        self.threshold = threshold
-        self.device    = device or torch.device(
             "cuda" if torch.cuda.is_available() else "cpu"
         )
-        print(f"[TextDetector] Loading tokenizer from {MODEL_NAME}...")
-        self.tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
         if os.path.exists(checkpoint):
-            print(f"[TextDetector] Loading checkpoint: {checkpoint}")
             self.model = HybridAITextDetector()
             self.model.load_state_dict(
                 torch.load(checkpoint, map_location=self.device)
             )
             self.model.eval().to(self.device)
-            print("[TextDetector] ✅ Model ready")
         else:
-            print(f"[TextDetector] ⚠️  Checkpoint '{checkpoint}' not found. "
-                  "Model NOT loaded — predictions will fail.")
-            self.model = None
-    # ------------------------------------------------------------------
     def predict(self, text: str) -> dict:
         """
         Classify a single text string.
@@ -72,17 +100,31 @@ class TextDetectorInference:
         -------
         dict with keys:
             label      : "AI-Generated" or "Human-Written"
-            confidence : probability of the predicted class (0-1)
             ai_prob    : raw P(AI-generated)
             human_prob : 1 - ai_prob
         """
-        if self.model is None:
-            return {"error": "Model not loaded — missing checkpoint file."}
         text = text.strip()
         if not text:
             return {"error": "Input text is empty."}
         enc = self.tokenizer(
             text,
             truncation=True,
@@ -90,7 +132,6 @@ class TextDetectorInference:
             max_length=MAX_LENGTH,
             return_tensors="pt",
         )
         input_ids      = enc["input_ids"].to(self.device)
         attention_mask = enc["attention_mask"].to(self.device)
         token_type_ids = enc.get(
@@ -99,8 +140,8 @@ class TextDetectorInference:
         ).to(self.device)
         with torch.no_grad():
-            logit    = self.model(input_ids, attention_mask, token_type_ids)
-            ai_prob  = torch.sigmoid(logit).item()
         human_prob = 1.0 - ai_prob
         is_ai      = ai_prob >= self.threshold
@@ -112,20 +153,59 @@ class TextDetectorInference:
             "confidence": round(confidence, 4),
             "ai_prob":    round(ai_prob, 4),
             "human_prob": round(human_prob, 4),
         }
-    # ------------------------------------------------------------------
-    def predict_batch(self, texts: list[str]) -> list[dict]:
         """Run predict() on a list of texts. Returns list of result dicts."""
         return [self.predict(t) for t in texts]
-    # ------------------------------------------------------------------
-    def format_for_gradio(self, text: str) -> tuple[str, float, dict]:
         """
-        Convenience wrapper that returns values in a Gradio-friendly format:
             (label_string, confidence_float, full_result_dict)
         """
         result = self.predict(text)
         if "error" in result:
             return result["error"], 0.0, result
-        return result["label"], result["confidence"], result

 text_detector_inference.py
 ==========================
 Inference wrapper for HybridAITextDetector.
+Strategy
+--------
+1. If  ``best_text_detector.pt``  exists  →  load the custom trained model.
+2. Otherwise                               →  fall back to a pretrained
+   HuggingFace AI-text detector so the Space keeps working immediately.
 Usage
 -----
 from text_detector_inference import TextDetectorInference
+detector = TextDetectorInference()          # auto-detects checkpoint
+result   = detector.predict("Some text…")
 """
 import os
 import torch
+from transformers import AutoTokenizer, pipeline as hf_pipeline
 from text_detector_model import HybridAITextDetector, MODEL_NAME, MAX_LENGTH
+# ─── Fallback model ───────────────────────────────────────────────────────────
+# Used when best_text_detector.pt is not present in the Space.
+# "Hello-SimpleAI/chatgpt-detector-roberta" is a publicly available,
+# well-validated AI-text detector (RoBERTa fine-tuned on ChatGPT outputs).
+FALLBACK_MODEL_ID = "Hello-SimpleAI/chatgpt-detector-roberta"
 class TextDetectorInference:
     """
+    Thin wrapper around HybridAITextDetector (or a fallback pretrained model)
+    for single-text prediction.
     Parameters
     ----------
     checkpoint : str
+        Path to the .pt state-dict file for the custom model.
     threshold  : float
         Decision boundary for the sigmoid probability (default 0.5).
+        Set to the optimal F1 threshold found during your training run.
+    device     : torch.device | None
         Auto-detects CUDA if None.
     """
     def __init__(
         self,
+        checkpoint: str   = "best_text_detector.pt",
         threshold:  float = 0.5,
         device: torch.device = None,
     ):
+        self.threshold   = threshold
+        self.device      = device or torch.device(
             "cuda" if torch.cuda.is_available() else "cpu"
         )
+        self._use_custom = False
+        self._fallback   = None
+        self.model       = None
+        self.tokenizer   = None
         if os.path.exists(checkpoint):
+            # ── Load custom trained HybridAITextDetector ──────────────────────
+            print(f"[TextDetector] ✅ Checkpoint found: {checkpoint}")
+            print(f"[TextDetector] Loading tokenizer from {MODEL_NAME} …")
+            self.tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
             self.model = HybridAITextDetector()
             self.model.load_state_dict(
                 torch.load(checkpoint, map_location=self.device)
             )
             self.model.eval().to(self.device)
+            self._use_custom = True
+            print("[TextDetector] ✅ Custom model ready")
         else:
+            # ── Fall back to pretrained HuggingFace model ─────────────────────
+            print(
+                f"[TextDetector] ⚠️  '{checkpoint}' not found.\n"
+                f"[TextDetector] Loading pretrained fallback: {FALLBACK_MODEL_ID}"
+            )
+            try:
+                self._fallback = hf_pipeline(
+                    "text-classification",
+                    model=FALLBACK_MODEL_ID,
+                    device=0 if torch.cuda.is_available() else -1,
+                    truncation=True,
+                    max_length=512,
+                )
+                print(f"[TextDetector] ✅ Fallback model ready ({FALLBACK_MODEL_ID})")
+            except Exception as e:
+                print(f"[TextDetector] ❌ Fallback model failed to load: {e}")
+                self._fallback = None
+    # ──────────────────────────────────────────────────────────────────────────
     def predict(self, text: str) -> dict:
         """
         Classify a single text string.
         -------
         dict with keys:
             label      : "AI-Generated" or "Human-Written"
+            confidence : probability of the predicted class (0–1)
             ai_prob    : raw P(AI-generated)
             human_prob : 1 - ai_prob
+            source     : "custom_model" | "pretrained_fallback"
         """
         text = text.strip()
         if not text:
             return {"error": "Input text is empty."}
+        if self._use_custom:
+            return self._predict_custom(text)
+        elif self._fallback is not None:
+            return self._predict_fallback(text)
+        else:
+            return {
+                "error": (
+                    "No model available. Upload 'best_text_detector.pt' to the "
+                    "Space, or check your internet connection so the fallback "
+                    "model can be downloaded."
+                )
+            }
+    # ──────────────────────────────────────────────────────────────────────────
+    def _predict_custom(self, text: str) -> dict:
+        """Run inference with the custom HybridAITextDetector checkpoint."""
         enc = self.tokenizer(
             text,
             truncation=True,
             max_length=MAX_LENGTH,
             return_tensors="pt",
         )
         input_ids      = enc["input_ids"].to(self.device)
         attention_mask = enc["attention_mask"].to(self.device)
         token_type_ids = enc.get(
         ).to(self.device)
         with torch.no_grad():
+            logit   = self.model(input_ids, attention_mask, token_type_ids)
+            ai_prob = torch.sigmoid(logit).item()
         human_prob = 1.0 - ai_prob
         is_ai      = ai_prob >= self.threshold
             "confidence": round(confidence, 4),
             "ai_prob":    round(ai_prob, 4),
             "human_prob": round(human_prob, 4),
+            "source":     "custom_model",
+        }
+    # ──────────────────────────────────────────────────────────────────────────
+    def _predict_fallback(self, text: str) -> dict:
+        """
+        Run inference with the pretrained HuggingFace fallback model.
+        Hello-SimpleAI/chatgpt-detector-roberta returns:
+            {"label": "ChatGPT" | "Human", "score": float}
+        We normalise this to the same dict shape as _predict_custom.
+        """
+        try:
+            raw = self._fallback(text)[0]          # {"label": ..., "score": ...}
+        except Exception as e:
+            return {"error": f"Fallback inference failed: {e}"}
+        hf_label = raw["label"].strip().lower()    # "chatgpt" or "human"
+        score    = float(raw["score"])             # confidence of the returned label
+        if hf_label in ("chatgpt", "ai", "fake", "generated"):
+            ai_prob    = score
+            human_prob = 1.0 - score
+            label      = "AI-Generated"
+        else:
+            human_prob = score
+            ai_prob    = 1.0 - score
+            label      = "Human-Written"
+        is_ai      = ai_prob >= self.threshold
+        label      = "AI-Generated" if is_ai else "Human-Written"
+        confidence = ai_prob if is_ai else human_prob
+        return {
+            "label":      label,
+            "confidence": round(confidence, 4),
+            "ai_prob":    round(ai_prob, 4),
+            "human_prob": round(human_prob, 4),
+            "source":     "pretrained_fallback",
         }
+    # ──────────────────────────────────────────────────────────────────────────
+    def predict_batch(self, texts: list) -> list:
         """Run predict() on a list of texts. Returns list of result dicts."""
         return [self.predict(t) for t in texts]
+    # ──────────────────────────────────────────────────────────────────────────
+    def format_for_gradio(self, text: str) -> tuple:
         """
+        Convenience wrapper returning Gradio-friendly values:
             (label_string, confidence_float, full_result_dict)
         """
         result = self.predict(text)
         if "error" in result:
             return result["error"], 0.0, result
+        return result["label"], result["confidence"], result