iyadalagha committed
Commit 9bdc83e · 1 Parent(s): 94f2df7

handle both ar and eng

Files changed (2)
  1. __pycache__/app.cpython-313.pyc +0 -0
  2. app.py +103 -7
__pycache__/app.cpython-313.pyc CHANGED
Binary files a/__pycache__/app.cpython-313.pyc and b/__pycache__/app.cpython-313.pyc differ
 
app.py CHANGED
@@ -1,16 +1,112 @@
- from fastapi import FastAPI
- from transformers import pipeline
- from pydantic import BaseModel
+ from fastapi import FastAPI, HTTPException
+ from pydantic import BaseModel, validator
+ import re
  import torch
+ from transformers import pipeline
+ from collections import Counter
+ import logging
+
+ # Configure logging
+ logging.basicConfig(filename="predictions.log", level=logging.INFO, format="%(asctime)s - %(message)s")

  app = FastAPI()
- torch.manual_seed(42) # For reproducibility
- detector = pipeline("text-classification", model="akshayvkt/detect-ai-text")
+
+ # Enable GPU if available, else use CPU
+ device = 0 if torch.cuda.is_available() else -1
+ torch.manual_seed(42)
+
+ # Load AI detection models
+ english_detector = pipeline("text-classification", model="akshayvkt/detect-ai-text", device=device, truncation=True, max_length=512)
+ arabic_detector = pipeline("text-classification", model="sabaridsnfuji/arabic-ai-text-detector", device=device, truncation=True, max_length=512)
+
+ def detect_language(text: str) -> str:
+     """Detect whether text is Arabic or English based on Unicode character ranges."""
+     # Count Arabic (U+0600–U+06FF) and Latin (A–Z, a–z) characters
+     arabic_chars = len(re.findall(r'[\u0600-\u06FF]', text))
+     latin_chars = len(re.findall(r'[A-Za-z]', text))
+     total_chars = arabic_chars + latin_chars
+
+     # If there are no alphabetic characters, default to English
+     if total_chars == 0:
+         return 'en'
+
+     # Classify as Arabic if more than half of the alphabetic characters are Arabic
+     arabic_ratio = arabic_chars / total_chars
+     return 'ar' if arabic_ratio > 0.5 else 'en'
+
+ def clean_text(text: str, language: str) -> str:
+     """Clean text by removing special characters and normalizing whitespace. Skip lowercasing for Arabic."""
+     text = re.sub(r'\s+', ' ', text)
+     text = re.sub(r'[^\w\s.,!?]', '', text)
+     text = text.strip()
+     if language == 'en':
+         text = text.lower()  # Lowercase only for English
+     return text
+
+ def split_text(text: str, max_chars: int = 5000) -> list:
+     """Split text into chunks of at most max_chars, preserving sentence boundaries."""
+     sentences = re.split(r'(?<=[.!?])\s+', text)
+     chunks = []
+     current_chunk = ""
+     for sentence in sentences:
+         if len(current_chunk) + len(sentence) <= max_chars:
+             current_chunk += sentence + " "
+         else:
+             if current_chunk:
+                 chunks.append(current_chunk.strip())
+             current_chunk = sentence + " "
+     if current_chunk:
+         chunks.append(current_chunk.strip())
+     return chunks

  class TextInput(BaseModel):
      text: str

+     @validator("text")
+     def validate_text(cls, value):
+         """Validate input text for minimum length and content."""
+         word_count = len(value.split())
+         if word_count < 50:
+             raise ValueError(f"Text too short ({word_count} words). Minimum 50 words required.")
+         if not re.search(r'[\w]', value):
+             raise ValueError("Text must contain alphabetic characters.")
+         return value
+
  @app.post("/predict")
  def predict(input: TextInput):
-     result = detector(input.text)
-     return {"prediction": result[0]['label'], "score": result[0]['score']}
+     # Detect language and pick the matching detector
+     detected_lang = detect_language(input.text)
+     detector = arabic_detector if detected_lang == 'ar' else english_detector
+     note_lang = f"Detected language: {'Arabic' if detected_lang == 'ar' else 'English'}"
+
+     # Clean text based on detected language
+     cleaned_text = clean_text(input.text, detected_lang)
+
+     if len(cleaned_text) > 10000:
+         # Split texts longer than 10,000 characters into chunks
+         chunks = split_text(cleaned_text, max_chars=5000)
+         # Batch-process the chunks
+         results = detector(chunks, truncation=True, max_length=512)
+         # Apply a custom threshold (0.7) for classification
+         labels = ["AI" if res['score'] >= 0.7 else "Human" for res in results]
+         label_counts = Counter(labels)
+         final_label = label_counts.most_common(1)[0][0]
+         # Average the scores of the chunks that carry the winning label
+         scores = [res['score'] for res, label in zip(results, labels) if label == final_label]
+         avg_score = sum(scores) / len(scores) if scores else 0.0
+         # Log prediction details
+         logging.info(f"Language: {detected_lang} | Text Length: {len(cleaned_text)} | Chunks: {len(chunks)} | Prediction: {final_label} | Score: {avg_score}")
+         return {
+             "prediction": final_label,
+             "score": avg_score,
+             "note": f"{note_lang}. Text was split into {len(chunks)} chunks because its length exceeded 10,000 characters.",
+             "chunk_results": [{"chunk": chunk[:50] + "...", "label": res['label'], "score": res['score']} for chunk, res in zip(chunks, results)]
+         }
+     else:
+         result = detector(cleaned_text, truncation=True, max_length=512)
+         score = result[0]['score']
+         label = "AI" if score >= 0.7 else "Human"
+         note = f"{note_lang}. Warning: score is close to the threshold (0.7)." if 0.65 <= score < 0.75 else note_lang
+         # Log prediction details
+         logging.info(f"Language: {detected_lang} | Text Length: {len(cleaned_text)} | Prediction: {label} | Score: {score} | Note: {note}")
+         return {"prediction": label, "score": score, "note": note}
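
For reference, a minimal client-side smoke test of the updated /predict endpoint might look like the sketch below. The host, port, and sample text are assumptions (the app served locally via "uvicorn app:app --port 8000", with the requests package installed); the sample is padded past the 50-word minimum enforced by the validator. Note also that the @validator decorator used above is the Pydantic v1 API; under Pydantic v2 the equivalent is field_validator.

import requests

# Hypothetical sample text, repeated so it clears the 50-word minimum.
sample = (
    "Artificial intelligence systems are increasingly used to generate text "
    "for articles, essays, and social media posts. "
) * 10

# POST the text to the endpoint added in this commit (assumed to run on localhost:8000).
resp = requests.post("http://localhost:8000/predict", json={"text": sample})
resp.raise_for_status()
print(resp.json())  # e.g. {"prediction": "AI", "score": 0.93, "note": "Detected language: English"}

Since the sample text is mostly Latin script and under 10,000 characters after cleaning, this request would exercise the single-shot branch with the English detector; a mostly-Arabic input would route to the Arabic model instead.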