Commit · 9bdc83e
Parent(s): 94f2df7
handle both ar and eng

Files changed:
- __pycache__/app.cpython-313.pyc (+0 -0)
- app.py (+103 -7)
__pycache__/app.cpython-313.pyc
CHANGED
Binary files a/__pycache__/app.cpython-313.pyc and b/__pycache__/app.cpython-313.pyc differ
app.py
CHANGED
@@ -1,16 +1,112 @@
-from fastapi import FastAPI
-from
-
+from fastapi import FastAPI, HTTPException
+from pydantic import BaseModel, validator
+import re
 import torch
+from transformers import pipeline
+from collections import Counter
+import logging
+
+# Configure logging
+logging.basicConfig(filename="predictions.log", level=logging.INFO, format="%(asctime)s - %(message)s")
 
 app = FastAPI()
-
-
+
+# Enable GPU if available, else use CPU
+device = 0 if torch.cuda.is_available() else -1
+torch.manual_seed(42)
+
+# Load AI detection models
+english_detector = pipeline("text-classification", model="akshayvkt/detect-ai-text", device=device, truncation=True, max_length=512)
+arabic_detector = pipeline("text-classification", model="sabaridsnfuji/arabic-ai-text-detector", device=device, truncation=True, max_length=512)
+
+def detect_language(text: str) -> str:
+    """Detect if text is Arabic or English based on Unicode character ranges."""
+    # Count Arabic (U+0600–U+06FF) and Latin (U+0041–U+007A) characters
+    arabic_chars = len(re.findall(r'[\u0600-\u06FF]', text))
+    latin_chars = len(re.findall(r'[A-Za-z]', text))
+    total_chars = arabic_chars + latin_chars
+
+    # If no alphabetic characters, default to English
+    if total_chars == 0:
+        return 'en'
+
+    # Classify as Arabic if >50% of alphabetic characters are Arabic
+    arabic_ratio = arabic_chars / total_chars
+    return 'ar' if arabic_ratio > 0.5 else 'en'
+
+def clean_text(text: str, language: str) -> str:
+    """Clean text by removing special characters and normalizing spaces. Skip lowercase for Arabic."""
+    text = re.sub(r'\s+', ' ', text)
+    text = re.sub(r'[^\w\s.,!?]', '', text)
+    text = text.strip()
+    if language == 'en':
+        text = text.lower()  # Lowercase only for English
+    return text
+
+def split_text(text: str, max_chars: int = 5000) -> list:
+    """Split text into chunks of max_chars, preserving sentence boundaries."""
+    sentences = re.split(r'(?<=[.!?])\s+', text)
+    chunks = []
+    current_chunk = ""
+    for sentence in sentences:
+        if len(current_chunk) + len(sentence) <= max_chars:
+            current_chunk += sentence + " "
+        else:
+            if current_chunk:
+                chunks.append(current_chunk.strip())
+            current_chunk = sentence + " "
+    if current_chunk:
+        chunks.append(current_chunk.strip())
+    return chunks
 
 class TextInput(BaseModel):
     text: str
 
+    @validator("text")
+    def validate_text(cls, value):
+        """Validate input text for minimum length and content."""
+        word_count = len(value.split())
+        if word_count < 50:
+            raise ValueError(f"Text too short ({word_count} words). Minimum 50 words required.")
+        if not re.search(r'[\w]', value):
+            raise ValueError("Text must contain alphabetic characters.")
+        return value
+
 @app.post("/predict")
 def predict(input: TextInput):
-
-
+    # Detect language
+    detected_lang = detect_language(input.text)
+    detector = arabic_detector if detected_lang == 'ar' else english_detector
+    note_lang = f"Detected language: {'Arabic' if detected_lang == 'ar' else 'English'}"
+
+    # Clean text based on detected language
+    cleaned_text = clean_text(input.text, detected_lang)
+
+    if len(cleaned_text) > 10000:
+        # Split into chunks for texts > 10,000 characters
+        chunks = split_text(cleaned_text, max_chars=5000)
+        # Batch process chunks
+        results = detector(chunks, truncation=True, max_length=512)
+        # Apply custom threshold (0.7) for classification
+        labels = ["AI" if res['score'] >= 0.7 else "Human" for res in results]
+        label_counts = Counter(labels)
+        final_label = label_counts.most_common(1)[0][0]
+        # Average score for the winning label
+        scores = [res['score'] for res, label in zip(results, labels) if label == final_label]
+        avg_score = sum(scores) / len(scores) if scores else 0.0
+        # Log prediction details
+        logging.info(f"Language: {detected_lang} | Text Length: {len(cleaned_text)} | Chunks: {len(chunks)} | Prediction: {final_label} | Score: {avg_score}")
+        return {
+            "prediction": final_label,
+            "score": avg_score,
+            "note": f"{note_lang}. Text was split into {len(chunks)} chunks due to length > 10,000 characters.",
+            "chunk_results": [{"chunk": chunk[:50] + "...", "label": res['label'], "score": res['score']} for chunk, res in zip(chunks, results)]
+        }
+    else:
+        result = detector(cleaned_text, truncation=True, max_length=512)
+        score = result[0]['score']
+        label = "AI" if score >= 0.7 else "Human"
+        note = f"{note_lang}. Warning: Close to threshold (0.7)" if 0.65 <= score < 0.75 else note_lang
+        # Log prediction details
+        logging.info(f"Language: {detected_lang} | Text Length: {len(cleaned_text)} | Prediction: {label} | Score: {score} | Note: {note}")
+        return {"prediction": label, "score": score, "note": note}