Commit · 9bdc83e
Parent(s): 94f2df7
handle both ar and eng

Files changed:
- __pycache__/app.cpython-313.pyc (+0 -0)
- app.py (+103 -7)
__pycache__/app.cpython-313.pyc
CHANGED
Binary files a/__pycache__/app.cpython-313.pyc and b/__pycache__/app.cpython-313.pyc differ
app.py
CHANGED
@@ -1,16 +1,112 @@
-from fastapi import FastAPI
-from
-
+from fastapi import FastAPI, HTTPException
+from pydantic import BaseModel, validator
+import re
 import torch
+from transformers import pipeline
+from collections import Counter
+import logging
+
+# Configure logging
+logging.basicConfig(filename="predictions.log", level=logging.INFO, format="%(asctime)s - %(message)s")
 
 app = FastAPI()
-
-
+
+# Enable GPU if available, else use CPU
+device = 0 if torch.cuda.is_available() else -1
+torch.manual_seed(42)
+
+# Load AI detection models
+english_detector = pipeline("text-classification", model="akshayvkt/detect-ai-text", device=device, truncation=True, max_length=512)
+arabic_detector = pipeline("text-classification", model="sabaridsnfuji/arabic-ai-text-detector", device=device, truncation=True, max_length=512)
+
+def detect_language(text: str) -> str:
+    """Detect if text is Arabic or English based on Unicode character ranges."""
+    # Count Arabic (U+0600–U+06FF) and Latin (U+0041–U+007A) characters
+    arabic_chars = len(re.findall(r'[\u0600-\u06FF]', text))
+    latin_chars = len(re.findall(r'[A-Za-z]', text))
+    total_chars = arabic_chars + latin_chars
+
+    # If no alphabetic characters, default to English
+    if total_chars == 0:
+        return 'en'
+
+    # Classify as Arabic if >50% of alphabetic characters are Arabic
+    arabic_ratio = arabic_chars / total_chars
+    return 'ar' if arabic_ratio > 0.5 else 'en'
+
+def clean_text(text: str, language: str) -> str:
+    """Clean text by removing special characters and normalizing spaces. Skip lowercase for Arabic."""
+    text = re.sub(r'\s+', ' ', text)
+    text = re.sub(r'[^\w\s.,!?]', '', text)
+    text = text.strip()
+    if language == 'en':
+        text = text.lower()  # Lowercase only for English
+    return text
+
+def split_text(text: str, max_chars: int = 5000) -> list:
+    """Split text into chunks of max_chars, preserving sentence boundaries."""
+    sentences = re.split(r'(?<=[.!?])\s+', text)
+    chunks = []
+    current_chunk = ""
+    for sentence in sentences:
+        if len(current_chunk) + len(sentence) <= max_chars:
+            current_chunk += sentence + " "
+        else:
+            if current_chunk:
+                chunks.append(current_chunk.strip())
+            current_chunk = sentence + " "
+    if current_chunk:
+        chunks.append(current_chunk.strip())
+    return chunks
 
 class TextInput(BaseModel):
     text: str
 
+    @validator("text")
+    def validate_text(cls, value):
+        """Validate input text for minimum length and content."""
+        word_count = len(value.split())
+        if word_count < 50:
+            raise ValueError(f"Text too short ({word_count} words). Minimum 50 words required.")
+        if not re.search(r'[\w]', value):
+            raise ValueError("Text must contain alphabetic characters.")
+        return value
+
 @app.post("/predict")
 def predict(input: TextInput):
-
-
+    # Detect language
+    detected_lang = detect_language(input.text)
+    detector = arabic_detector if detected_lang == 'ar' else english_detector
+    note_lang = f"Detected language: {'Arabic' if detected_lang == 'ar' else 'English'}"
+
+    # Clean text based on detected language
+    cleaned_text = clean_text(input.text, detected_lang)
+
+    if len(cleaned_text) > 10000:
+        # Split into chunks for texts > 10,000 characters
+        chunks = split_text(cleaned_text, max_chars=5000)
+        # Batch process chunks
+        results = detector(chunks, truncation=True, max_length=512)
+        # Apply custom threshold (0.7) for classification
+        labels = ["AI" if res['score'] >= 0.7 else "Human" for res in results]
+        label_counts = Counter(labels)
+        final_label = label_counts.most_common(1)[0][0]
+        # Average score for the winning label
+        scores = [res['score'] for res, label in zip(results, labels) if label == final_label]
+        avg_score = sum(scores) / len(scores) if scores else 0.0
+        # Log prediction details
+        logging.info(f"Language: {detected_lang} | Text Length: {len(cleaned_text)} | Chunks: {len(chunks)} | Prediction: {final_label} | Score: {avg_score}")
+        return {
+            "prediction": final_label,
+            "score": avg_score,
+            "note": f"{note_lang}. Text was split into {len(chunks)} chunks due to length > 10,000 characters.",
+            "chunk_results": [{"chunk": chunk[:50] + "...", "label": res['label'], "score": res['score']} for chunk, res in zip(chunks, results)]
+        }
+    else:
+        result = detector(cleaned_text, truncation=True, max_length=512)
+        score = result[0]['score']
+        label = "AI" if score >= 0.7 else "Human"
+        note = f"{note_lang}. Warning: Close to threshold (0.7)" if 0.65 <= score < 0.75 else note_lang
+        # Log prediction details
+        logging.info(f"Language: {detected_lang} | Text Length: {len(cleaned_text)} | Prediction: {label} | Score: {score} | Note: {note}")
+        return {"prediction": label, "score": score, "note": note}