Pujan-Dev committed
Commit bc13edc · 1 Parent(s): b247b11

fix: fixed the server err
Dockerfile ADDED
@@ -0,0 +1,17 @@
+ # Read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
+ # you will also find guides on how best to write your Dockerfile
+
+ FROM python:3.9
+
+ RUN useradd -m -u 1000 user
+ USER user
+ ENV PATH="/home/user/.local/bin:$PATH"
+
+ WORKDIR /app
+
+ COPY --chown=user ./requirements.txt requirements.txt
+ RUN pip install --no-cache-dir --upgrade -r requirements.txt
+ RUN python -m spacy download en_core_web_sm || echo "Failed to download model"
+
+ COPY --chown=user . /app
+ CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
features/text_classifier/controller.py CHANGED
@@ -5,12 +5,12 @@ from io import BytesIO
 
  from fastapi import HTTPException, UploadFile, status, Depends
  from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
- from nltk.tokenize import sent_tokenize
 
  from .inferencer import classify_text
  from .preprocess import parse_docx, parse_pdf, parse_txt
-
+ import spacy
  security = HTTPBearer()
+ nlp = spacy.load("en_core_web_sm")
 
  # Verify Bearer token from Authorization header
  async def verify_token(credentials: HTTPAuthorizationCredentials = Depends(security)):
@@ -77,18 +77,23 @@ async def handle_file_upload(file: UploadFile):
          logging.error(f"Error processing file: {e}")
          raise HTTPException(status_code=500, detail="Error processing the file")
 
- # Analyze each sentence in plain text input
+
+
  async def handle_sentence_level_analysis(text: str):
      text = text.strip()
-     if text[-1] != ".":
-         text+="."
+     if not text.endswith("."):
+         text += "."
+
      if len(text) > 10000:
          raise HTTPException(status_code=413, detail="Text must be less than 10,000 characters")
-
-     sentences = sent_tokenize(text, language="english")
+
+     # Use SpaCy for sentence splitting
+     doc = nlp(text)
+     sentences = [sent.text.strip() for sent in doc.sents]
+
      results = []
      for sentence in sentences:
-         if not sentence.strip():
+         if not sentence:
              continue
          label, perplexity, ai_likelihood = await asyncio.to_thread(classify_text, sentence)
          results.append({
@@ -97,9 +102,8 @@ async def handle_sentence_level_analysis(text: str):
              "perplexity": round(perplexity, 2),
              "ai_likelihood": ai_likelihood
          })
-     return {"analysis": results}
 
- # Analyze each sentence from uploaded file
+     return {"analysis": results}# Analyze each sentence from uploaded file
  async def handle_file_sentence(file: UploadFile):
      try:
          file_contents = await extract_file_contents(file)
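
The change above swaps NLTK's sent_tokenize, which depends on the punkt data files being present at runtime (a likely cause of the server error this commit fixes), for spaCy's pipeline-based sentence segmentation. A standalone sketch of the new splitting behaviour, with the sample text invented for illustration:

import spacy

# en_core_web_sm includes a parser that sets sentence boundaries,
# so doc.sents works without any extra data download at request time
nlp = spacy.load("en_core_web_sm")

doc = nlp("This is one sentence. Here is another! And a third?")
sentences = [sent.text.strip() for sent in doc.sents]
print(sentences)
# expected: ['This is one sentence.', 'Here is another!', 'And a third?']

Loading nlp once at module import, as the diff does next to security = HTTPBearer(), keeps the model-load cost out of the per-request path.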
features/text_classifier/model_loader.py CHANGED
@@ -5,7 +5,8 @@ from transformers import GPT2LMHeadModel, GPT2TokenizerFast, GPT2Config
  from huggingface_hub import snapshot_download
  import torch
  from dotenv import load_dotenv
- import nltk
+ import spacy
+
  load_dotenv()
  REPO_ID = "Pujan-Dev/AI-Text-Detector"
  MODEL_DIR = "./models"
@@ -15,17 +16,18 @@ WEIGHTS_PATH = os.path.join(MODEL_DIR, "model_weights.pth")
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  _model, _tokenizer = None, None
 
-
  def warmup():
      global _model, _tokenizer
      # Ensure punkt is available
-     nltk.download("punkt")
-
-     nltk.download('punkt_tab')
+     try:
+         nlp = spacy.load("en_core_web_sm")
+     except OSError:
+         import subprocess
+         subprocess.run(["python", "-m", "spacy", "download", "en_core_web_sm"])
+         nlp = spacy.load("en_core_web_sm")
 
      download_model_repo()
      _model, _tokenizer = load_model()
-     logging.info("Its ready")
 
 
  def download_model_repo():
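
One caveat with the fallback above: it shells out to "python", which in some environments is not the interpreter running the app. A sketch of an interpreter-safe variation using sys.executable; this is an alternative under that assumption, not what the commit ships:

import sys
import subprocess

import spacy

def ensure_spacy_model(name: str = "en_core_web_sm"):
    # Load the model if installed; otherwise download it with the exact
    # interpreter running this process, then retry the load once.
    try:
        return spacy.load(name)
    except OSError:
        subprocess.run([sys.executable, "-m", "spacy", "download", name], check=True)
        return spacy.load(name)

In the Docker image this branch should rarely trigger, since the Dockerfile already downloads en_core_web_sm at build time; the runtime retry only covers the case where that build step fell through to its || echo fallback.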