Spaces:

Pujan-Dev
/

AI_API

Sleeping

App Files Files Community

Pujan-Dev committed on May 27, 2025

Commit

183f1c4

1 Parent(s): bc13edc

feat:added nepali language support

Browse files

Files changed (12) hide show

.gitignore +1 -0
README.md +9 -0
app.py +2 -1
features/nepali_text_classifier/__init__.py +0 -0
features/nepali_text_classifier/controller.py +131 -0
features/nepali_text_classifier/inferencer.py +23 -0
features/nepali_text_classifier/model_loader.py +54 -0
features/nepali_text_classifier/preprocess.py +38 -0
features/nepali_text_classifier/routes.py +45 -0
features/text_classifier/controller.py +3 -5
features/text_classifier/model_loader.py +2 -9
requirements.txt +3 -2

.gitignore CHANGED Viewed

@@ -59,3 +59,4 @@ model/
 models/.gitattributes  #<-- This line can stay if you only want to ignore that file, not the whole folder
 todo.md

 models/.gitattributes  #<-- This line can stay if you only want to ignore that file, not the whole folder
 todo.md
+np_text_model

README.md ADDED Viewed

	@@ -0,0 +1,9 @@

+---
+title: Ai-Checker
+emoji: 🚀
+colorFrom: yellow
+colorTo: blue
+sdk: docker
+pinned: false
+---

app.py CHANGED Viewed

@@ -5,6 +5,7 @@ from slowapi.errors import RateLimitExceeded
 from slowapi.util import get_remote_address
 from fastapi.responses import JSONResponse
 from features.text_classifier.routes import router as text_classifier_router
 from config import ACCESS_RATE
 import requests
 limiter = Limiter(key_func=get_remote_address, default_limits=[ACCESS_RATE])
@@ -25,7 +26,7 @@ app.add_middleware(SlowAPIMiddleware)
 # Include your routes
 app.include_router(text_classifier_router, prefix="/text")
 @app.get("/")
 @limiter.limit(ACCESS_RATE)
 async def root(request: Request):

 from slowapi.util import get_remote_address
 from fastapi.responses import JSONResponse
 from features.text_classifier.routes import router as text_classifier_router
+from features.nepali_text_classifier.routes import router as nepali_text_classifier_router
 from config import ACCESS_RATE
 import requests
 limiter = Limiter(key_func=get_remote_address, default_limits=[ACCESS_RATE])
 # Include your routes
 app.include_router(text_classifier_router, prefix="/text")
+app.include_router(nepali_text_classifier_router,prefix="/NP")
 @app.get("/")
 @limiter.limit(ACCESS_RATE)
 async def root(request: Request):

features/nepali_text_classifier/__init__.py ADDED Viewed

File without changes

features/nepali_text_classifier/controller.py ADDED Viewed

	@@ -0,0 +1,131 @@

+import asyncio
+from io import BytesIO
+from fastapi import HTTPException, UploadFile, status, Depends
+from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
+import os
+from features.nepali_text_classifier.inferencer import classify_text
+from  features.nepali_text_classifier.preprocess import *
+import re
+# HTTP bearer scheme; verify_token depends on it to obtain the request's credentials.
+security = HTTPBearer()
+def contains_english(text: str) -> bool:
+    # Remove escape characters
+    cleaned = text.replace("\n", "").replace("\t", "")
+    return bool(re.search(r'[a-zA-Z]', cleaned))
+async def verify_token(credentials: HTTPAuthorizationCredentials = Depends(security)):
+    token = credentials.credentials
+    expected_token = os.getenv("MY_SECRET_TOKEN")
+    if token != expected_token:
+        raise HTTPException(
+            status_code=status.HTTP_403_FORBIDDEN,
+            detail="Invalid or expired token"
+        )
+    return token
+async def nepali_text_analysis(text: str):
+    end_symbol_for_NP_text(text)
+    words = text.split()
+    if len(words) < 10:
+        raise HTTPException(status_code=400, detail="Text must contain at least 10 words")
+    if len(text) > 10000:
+        raise HTTPException(status_code=413, detail="Text must be less than 10,000 characters")
+    result = await asyncio.to_thread(classify_text, text)
+    return result
+#Extract text form uploaded files(.docx,.pdf,.txt)
+async def extract_file_contents(file:UploadFile)-> str:
+    content = await file.read()
+    file_stream = BytesIO(content)
+    if file.content_type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
+        return parse_docx(file_stream)
+    elif file.content_type =="application/pdf":
+        return parse_pdf(file_stream)
+    elif file.content_type =="text/plain":
+        return parse_txt(file_stream)
+    else:
+        raise HTTPException(status_code=415,detail="Invalid file type. Only .docx,.pdf and .txt are allowed")
+async def handle_file_upload(file: UploadFile):
+    try:
+        file_contents = await extract_file_contents(file)
+        end_symbol_for_NP_text(file_contents)
+        if len(file_contents) > 10000:
+            raise HTTPException(status_code=413, detail="Text must be less than 10,000 characters")
+        cleaned_text = file_contents.replace("\n", " ").replace("\t", " ").strip()
+        if not cleaned_text:
+            raise HTTPException(status_code=404, detail="The file is empty or only contains whitespace.")
+        result = await asyncio.to_thread(classify_text, cleaned_text)
+        return result
+    except Exception as e:
+        logging.error(f"Error processing file: {e}")
+        raise HTTPException(status_code=500, detail="Error processing the file")
+async def handle_sentence_level_analysis(text: str):
+    text = text.strip()
+    if len(text) > 10000:
+        raise HTTPException(status_code=413, detail="Text must be less than 10,000 characters")
+    end_symbol_for_NP_text(text)
+    # Split text into sentences
+    sentences = [s.strip() + "।" for s in text.split("।") if s.strip()]
+    results = []
+    for sentence in sentences:
+        end_symbol_for_NP_text(sentence)
+        result = await asyncio.to_thread(classify_text, sentence)
+        results.append({
+            "text": sentence,
+            "result": result["label"],
+            "likelihood": result["confidence"]
+        })
+    return {"analysis": results}
+async def handle_file_sentence(file:UploadFile):
+    try:
+        file_contents = await extract_file_contents(file)
+        if len(file_contents) > 10000:
+            raise HTTPException(status_code=413, detail="Text must be less than 10,000 characters")
+        cleaned_text = file_contents.replace("\n", " ").replace("\t", " ").strip()
+        if not cleaned_text:
+            raise HTTPException(status_code=404, detail="The file is empty or only contains whitespace.")
+        # Ensure text ends with danda so last sentence is included
+        # Split text into sentences
+        sentences = [s.strip() + "।" for s in cleaned_text.split("।") if s.strip()]
+        results = []
+        for sentence in sentences:
+            end_symbol_for_NP_text(sentence)
+            result = await asyncio.to_thread(classify_text, sentence)
+            results.append({
+                "text": sentence,
+                "result": result["label"],
+                "likelihood": result["confidence"]
+            })
+        return {"analysis": results}
+    except Exception as e:
+        logging.error(f"Error processing file: {e}")
+        raise HTTPException(status_code=500, detail="Error processing the file")
+def classify(text: str):
+    return classify_text(text)

features/nepali_text_classifier/inferencer.py ADDED Viewed

	@@ -0,0 +1,23 @@

+import torch
+from .model_loader import get_model_tokenizer
+import torch.nn.functional as F
+# Use the GPU when CUDA is available, otherwise fall back to the CPU.
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+def classify_text(text: str):
+    model, tokenizer = get_model_tokenizer()
+    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512)
+    inputs = {k: v.to(device) for k, v in inputs.items()}
+    with torch.no_grad():
+        outputs = model(**inputs)
+        logits = outputs if isinstance(outputs, torch.Tensor) else outputs.logits
+        probs = F.softmax(logits, dim=1)
+        pred = torch.argmax(probs, dim=1).item()
+        prob_percent = probs[0][pred].item() * 100
+    return {"label": "Human" if pred == 0 else "AI", "confidence": round(prob_percent, 2)}

features/nepali_text_classifier/model_loader.py ADDED Viewed

	@@ -0,0 +1,54 @@

+import os
+import shutil
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import logging
+from huggingface_hub import snapshot_download
+from transformers import AutoTokenizer, AutoModel
+# Configs
+# Hugging Face Hub repository fetched by download_model_repo().
+REPO_ID = "Pujan-Dev/Nepali-AI-VS-HUMAN"
+# Local directory the downloaded snapshot is copied into.
+BASE_DIR = "./np_text_model"
+# Folder handed to AutoTokenizer.from_pretrained() in load_model().
+TOKENIZER_DIR = os.path.join(BASE_DIR, "classifier")  # presumably must match the tokenizer folder name in the repo; verify
+# Checkpoint handed to torch.load() in load_model().
+WEIGHTS_PATH = os.path.join(BASE_DIR, "model_95_acc.pth")  # presumably must match the weight file name in the repo; verify
+# Use the GPU when CUDA is available, otherwise fall back to the CPU.
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+# Define model class
+class XLMRClassifier(nn.Module):
+    def __init__(self):
+        super(XLMRClassifier, self).__init__()
+        self.bert = AutoModel.from_pretrained("xlm-roberta-base")
+        self.classifier = nn.Linear(self.bert.config.hidden_size, 2)
+    def forward(self, input_ids, attention_mask):
+        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
+        cls_output = outputs.last_hidden_state[:, 0, :]
+        return self.classifier(cls_output)
+# Globals for caching: filled by get_model_tokenizer() on first use and reused afterwards.
+_model = None
+_tokenizer = None
+def download_model_repo():
+    if os.path.exists(BASE_DIR) and os.path.isdir(BASE_DIR):
+        logging.info("Model already downloaded.")
+        return
+    snapshot_path = snapshot_download(repo_id=REPO_ID)
+    os.makedirs(BASE_DIR, exist_ok=True)
+    shutil.copytree(snapshot_path, BASE_DIR, dirs_exist_ok=True)
+def load_model():
+    download_model_repo()
+    tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_DIR)
+    model = XLMRClassifier().to(device)
+    model.load_state_dict(torch.load(WEIGHTS_PATH, map_location=device))
+    model.eval()
+    return model, tokenizer
+def get_model_tokenizer():
+    global _model, _tokenizer
+    if _model is None or _tokenizer is None:
+        _model, _tokenizer = load_model()
+    return _model, _tokenizer

features/nepali_text_classifier/preprocess.py ADDED Viewed

	@@ -0,0 +1,38 @@

+import fitz  # PyMuPDF
+import docx
+from io import BytesIO
+import logging
+from fastapi import HTTPException
+def parse_docx(file: BytesIO):
+    doc = docx.Document(file)
+    text = ""
+    for para in doc.paragraphs:
+        text += para.text + "\n"
+    return text
+def parse_pdf(file: BytesIO):
+    try:
+        doc = fitz.open(stream=file, filetype="pdf")
+        text = ""
+        for page_num in range(doc.page_count):
+            page = doc.load_page(page_num)
+            text += page.get_text()
+        return text
+    except Exception as e:
+        logging.error(f"Error while processing PDF: {str(e)}")
+        raise HTTPException(
+            status_code=500, detail="Error processing PDF file")
+def parse_txt(file: BytesIO):
+    return file.read().decode("utf-8")
+def end_symbol_for_NP_text(text):
+        if not text.endswith("।"):
+            text += "।"

features/nepali_text_classifier/routes.py ADDED Viewed

	@@ -0,0 +1,45 @@

+from fastapi import APIRouter, File, Request, Depends, HTTPException, UploadFile
+from fastapi.security import HTTPBearer
+from pydantic import BaseModel
+from slowapi import Limiter
+from slowapi.util import get_remote_address
+from config import ACCESS_RATE
+from .controller import handle_file_sentence, handle_sentence_level_analysis, nepali_text_analysis
+from .controller import handle_file_upload
+from .controller import verify_token
+from .inferencer import classify_text
+# Router the endpoints of this module are registered on.
+router = APIRouter()
+# Rate limiter that uses get_remote_address as its key function.
+limiter = Limiter(key_func=get_remote_address)
+# HTTP bearer scheme instance.
+security = HTTPBearer()
+# Input schema: JSON body of the text-based endpoints.
+class TextInput(BaseModel):
+    # Text submitted for classification.
+    text: str
+@router.post("/analyse")
+@limiter.limit(ACCESS_RATE)
+async def analyse(request: Request, data: TextInput, token: str = Depends(security)):
+    result = classify_text(data.text)
+    return result
+@router.post("/upload")
+@limiter.limit(ACCESS_RATE)
+async def upload_file(request:Request,file:UploadFile=File(...),token:str=Depends(security)):
+    return await handle_file_upload(file)
+@router.post("/analyse-sentences")
+@limiter.limit(ACCESS_RATE)
+async def upload_file(request:Request,data:TextInput,token:str=Depends(security)):
+    return await  handle_sentence_level_analysis(data.text)
+@router.post("/file-sentences-analyse")
+@limiter.limit(ACCESS_RATE)
+async def analyze_sentance_file(request: Request, file: UploadFile = File(...), token: str = Depends(security)):
+    return await handle_file_sentence(file)
+@router.get("/health")
+@limiter.limit(ACCESS_RATE)
+def health(request: Request):
+    return {"status": "ok"}

features/text_classifier/controller.py CHANGED Viewed

@@ -52,7 +52,7 @@ async def extract_file_contents(file: UploadFile) -> str:
     else:
         raise HTTPException(
             status_code=415,
-            detail="Invalid file type. Only .docx, .pdf, and .txt are allowed."
         )
 # Classify text from uploaded file
@@ -60,7 +60,7 @@ async def handle_file_upload(file: UploadFile):
     try:
         file_contents = await extract_file_contents(file)
         if len(file_contents) > 10000:
-            return {"message": "File contains more than 10,000 characters."}
         cleaned_text = file_contents.replace("\n", " ").replace("\t", " ").strip()
         if not cleaned_text:
@@ -87,7 +87,6 @@ async def handle_sentence_level_analysis(text: str):
     if len(text) > 10000:
         raise HTTPException(status_code=413, detail="Text must be less than 10,000 characters")
-    # Use SpaCy for sentence splitting
     doc = nlp(text)
     sentences = [sent.text.strip() for sent in doc.sents]
@@ -108,7 +107,7 @@ async def handle_file_sentence(file: UploadFile):
     try:
         file_contents = await extract_file_contents(file)
         if len(file_contents) > 10000:
-            return {"message": "File contains more than 10,000 characters."}
         cleaned_text = file_contents.replace("\n", " ").replace("\t", " ").strip()
         if not cleaned_text:
@@ -123,7 +122,6 @@ async def handle_file_sentence(file: UploadFile):
         logging.error(f"Error processing file: {e}")
         raise HTTPException(status_code=500, detail="Error processing the file")
-# Optional synchronous helper function
 def classify(text: str):
     return classify_text(text)

     else:
         raise HTTPException(
             status_code=415,
+            detail="Invalid file type. Only .docx, .pdf and .txt are allowed."
         )
 # Classify text from uploaded file
     try:
         file_contents = await extract_file_contents(file)
         if len(file_contents) > 10000:
+            raise HTTPException(status_code=413, detail="Text must be less than 10,000 characters")
         cleaned_text = file_contents.replace("\n", " ").replace("\t", " ").strip()
         if not cleaned_text:
     if len(text) > 10000:
         raise HTTPException(status_code=413, detail="Text must be less than 10,000 characters")
     doc = nlp(text)
     sentences = [sent.text.strip() for sent in doc.sents]
     try:
         file_contents = await extract_file_contents(file)
         if len(file_contents) > 10000:
+            raise HTTPException(status_code=413, detail="Text must be less than 10,000 characters")
         cleaned_text = file_contents.replace("\n", " ").replace("\t", " ").strip()
         if not cleaned_text:
         logging.error(f"Error processing file: {e}")
         raise HTTPException(status_code=500, detail="Error processing the file")
 def classify(text: str):
     return classify_text(text)

features/text_classifier/model_loader.py CHANGED Viewed

@@ -5,8 +5,6 @@ from transformers import GPT2LMHeadModel, GPT2TokenizerFast, GPT2Config
 from huggingface_hub import snapshot_download
 import torch
 from dotenv import load_dotenv
-import spacy
 load_dotenv()
 REPO_ID = "Pujan-Dev/AI-Text-Detector"
 MODEL_DIR = "./models"
@@ -16,18 +14,13 @@ WEIGHTS_PATH = os.path.join(MODEL_DIR, "model_weights.pth")
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 _model, _tokenizer = None, None
 def warmup():
     global _model, _tokenizer
     # Ensure punkt is available
-    try:
-        nlp = spacy.load("en_core_web_sm")
-    except OSError:
-        import subprocess
-        subprocess.run(["python", "-m", "spacy", "download", "en_core_web_sm"])
-        nlp = spacy.load("en_core_web_sm")
     download_model_repo()
     _model, _tokenizer = load_model()
 def download_model_repo():

 from huggingface_hub import snapshot_download
 import torch
 from dotenv import load_dotenv
 load_dotenv()
 REPO_ID = "Pujan-Dev/AI-Text-Detector"
 MODEL_DIR = "./models"
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 _model, _tokenizer = None, None
 def warmup():
     global _model, _tokenizer
     # Ensure punkt is available
     download_model_repo()
     _model, _tokenizer = load_model()
+    logging.info("Its ready")
 def download_model_repo():

requirements.txt CHANGED Viewed

@@ -7,6 +7,7 @@ python-dotenv
 python-docx
 pydantic
 PyMuPDF
-nltk
 python-multipart
-slowapi

 python-docx
 pydantic
 PyMuPDF
 python-multipart
+slowapi
+spacy
+nltk