import os
import secrets
import asyncio
import logging
from io import BytesIO

from fastapi import HTTPException, UploadFile, status, Depends
from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
import spacy

from .inferencer import classify_text
from .preprocess import parse_docx, parse_pdf, parse_txt

security = HTTPBearer()
nlp = spacy.load("en_core_web_sm")  # spaCy model used for sentence segmentation
# Verify Bearer token from Authorization header
async def verify_token(credentials: HTTPAuthorizationCredentials = Depends(security)):
    token = credentials.credentials
    expected_token = os.getenv("MY_SECRET_TOKEN")
    # Constant-time comparison; also rejects all requests if the env var is unset
    if not expected_token or not secrets.compare_digest(token, expected_token):
        raise HTTPException(
            status_code=status.HTTP_403_FORBIDDEN,
            detail="Invalid or expired token"
        )
    return token
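# Usage sketch (hypothetical; `app` and the route below are not defined in this
# module): verify_token is designed to be attached as a FastAPI dependency, e.g.
#
#     from fastapi import FastAPI
#     app = FastAPI()
#
#     @app.post("/analyze")
#     async def analyze(text: str, token: str = Depends(verify_token)):
#         return await handle_text_analysis(text)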
# Classify plain text input
async def handle_text_analysis(text: str):
    text = text.strip()
    if not text or len(text.split()) < 10:
        raise HTTPException(status_code=400, detail="Text must contain at least 10 words")
    if len(text) > 10000:
        raise HTTPException(status_code=413, detail="Text must be less than 10,000 characters")
    # Run the blocking model inference off the event loop
    label, perplexity, ai_likelihood = await asyncio.to_thread(classify_text, text)
    return {
        "result": label,
        "perplexity": round(perplexity, 2),
        "ai_likelihood": ai_likelihood
    }
# Extract text from uploaded files (.docx, .pdf, .txt)
async def extract_file_contents(file: UploadFile) -> str:
    content = await file.read()
    file_stream = BytesIO(content)
    if file.content_type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
        return parse_docx(file_stream)
    elif file.content_type == "application/pdf":
        return parse_pdf(file_stream)
    elif file.content_type == "text/plain":
        return parse_txt(file_stream)
    else:
        raise HTTPException(
            status_code=415,
            detail="Invalid file type. Only .docx, .pdf and .txt are allowed."
        )
# Classify text from uploaded file
async def handle_file_upload(file: UploadFile):
    try:
        file_contents = await extract_file_contents(file)
        if len(file_contents) > 10000:
            raise HTTPException(status_code=413, detail="Text must be less than 10,000 characters")
        cleaned_text = file_contents.replace("\n", " ").replace("\t", " ").strip()
        if not cleaned_text:
            raise HTTPException(status_code=400, detail="The file is empty or only contains whitespace.")
        label, perplexity, ai_likelihood = await asyncio.to_thread(classify_text, cleaned_text)
        return {
            "content": file_contents,
            "result": label,
            "perplexity": round(perplexity, 2),
            "ai_likelihood": ai_likelihood
        }
    except HTTPException:
        # Re-raise deliberate HTTP errors (400, 413, 415) instead of masking them as 500
        raise
    except Exception as e:
        logging.error(f"Error processing file: {e}")
        raise HTTPException(status_code=500, detail="Error processing the file")
# Classify each sentence of the input text individually
async def handle_sentence_level_analysis(text: str):
    text = text.strip()
    # Ensure terminal punctuation so spaCy segments the final sentence
    if not text.endswith("."):
        text += "."
    if len(text) > 10000:
        raise HTTPException(status_code=413, detail="Text must be less than 10,000 characters")
    doc = nlp(text)
    sentences = [sent.text.strip() for sent in doc.sents]
    results = []
    for sentence in sentences:
        if not sentence:
            continue
        label, perplexity, ai_likelihood = await asyncio.to_thread(classify_text, sentence)
        results.append({
            "sentence": sentence,
            "label": label,
            "perplexity": round(perplexity, 2),
            "ai_likelihood": ai_likelihood
        })
    return {"analysis": results}
# Analyze each sentence from uploaded file
async def handle_file_sentence(file: UploadFile):
    try:
        file_contents = await extract_file_contents(file)
        if len(file_contents) > 10000:
            raise HTTPException(status_code=413, detail="Text must be less than 10,000 characters")
        cleaned_text = file_contents.replace("\n", " ").replace("\t", " ").strip()
        if not cleaned_text:
            raise HTTPException(status_code=400, detail="The file is empty or only contains whitespace.")
        result = await handle_sentence_level_analysis(cleaned_text)
        return {
            "content": file_contents,
            **result
        }
    except HTTPException:
        # Re-raise deliberate HTTP errors instead of masking them as 500
        raise
    except Exception as e:
        logging.error(f"Error processing file: {e}")
        raise HTTPException(status_code=500, detail="Error processing the file")
# Synchronous convenience wrapper around the classifier (no thread offloading)
def classify(text: str):
    return classify_text(text)
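# Example wiring for the file handlers (a sketch; the router and paths are
# assumptions, since nothing in this module registers routes itself):
#
#     from fastapi import APIRouter, File
#
#     router = APIRouter(dependencies=[Depends(verify_token)])
#
#     @router.post("/file")
#     async def analyze_file(file: UploadFile = File(...)):
#         return await handle_file_upload(file)
#
#     @router.post("/file/sentences")
#     async def analyze_file_sentences(file: UploadFile = File(...)):
#         return await handle_file_sentence(file)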