import io
import os
from typing import List

# Point the Hugging Face cache at a writable location (e.g. serverless hosts
# where only /tmp is writable). Must be set BEFORE any HF library reads it,
# so it goes ahead of the sentence_transformers import (the original set it
# after all imports).
os.environ["HF_HOME"] = "/tmp/huggingface_cache"

import faiss
import fitz  # PyMuPDF
import numpy as np
import pytesseract
import requests
from docx import Document
from fastapi import FastAPI
from PIL import Image
from pptx import Presentation
from pydantic import BaseModel
from sentence_transformers import SentenceTransformer

app = FastAPI()

# File extensions routed to the OCR (image) extraction path.
IMAGE_EXTENSIONS = {
    ".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".tif", ".webp",
    ".pbm", ".pgm", ".ppm", ".gif", ".jp2", ".pcx", ".pnm",
}


def clean_text(text: str) -> str:
    """Collapse all whitespace runs (newlines, tabs, multiple spaces) into single spaces."""
    return " ".join(text.split())


def extract_text_from_pdf_bytes(file_bytes: bytes) -> str:
    """Extract and normalize text from a PDF supplied as raw bytes."""
    # Context manager releases the MuPDF document handle; the original left
    # it open, leaking a native resource per request.
    with fitz.open(stream=file_bytes, filetype="pdf") as doc:
        text = " ".join(page.get_text() for page in doc)
    return clean_text(text)


def extract_text_from_doc_bytes(file_bytes: bytes) -> str:
    """Extract paragraph text from a .docx file supplied as raw bytes.

    NOTE: python-docx only parses OOXML (.docx); legacy binary .doc files
    raise and are reported by the caller's error handler.
    """
    doc = Document(io.BytesIO(file_bytes))
    return clean_text(" ".join(para.text for para in doc.paragraphs))


def extract_text_from_ppt_bytes(file_bytes: bytes) -> str:
    """Extract text from every text-bearing shape of a .pptx supplied as bytes.

    NOTE: python-pptx only parses .pptx; legacy .ppt files raise and are
    reported by the caller's error handler.
    """
    prs = Presentation(io.BytesIO(file_bytes))
    pieces = (
        shape.text
        for slide in prs.slides
        for shape in slide.shapes
        if hasattr(shape, "text")
    )
    return clean_text(" ".join(pieces))


def extract_text_from_image_bytes(file_bytes: bytes) -> str:
    """OCR an image supplied as raw bytes using the system Tesseract binary."""
    pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"
    image = Image.open(io.BytesIO(file_bytes))
    return clean_text(pytesseract.image_to_string(image))


def download_file(url: str, timeout: float = 30.0) -> bytes:
    """Download *url* and return the raw response body.

    Raises requests.HTTPError on non-2xx responses. A timeout is applied —
    the original passed none, so a stalled server would hang the request
    forever. The new parameter has a default, so existing callers are
    unaffected.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return response.content


@app.get("/")
def home():
    """Liveness check endpoint."""
    return {"msg": "hello ml server is deployed"}


@app.post("/extract-text")
async def extract_text(file_urls: List[str]):
    """Download each URL and return a {url: extracted_text} mapping.

    Per-file failures are reported as an error string for that URL rather
    than aborting the whole batch.
    """
    extracted_texts = {}
    for url in file_urls:
        try:
            file_bytes = download_file(url)
            # Strip any query string before taking the file extension.
            ext = os.path.splitext(url.split("?")[0])[1].lower()
            if ext == ".pdf":
                text = extract_text_from_pdf_bytes(file_bytes)
            elif ext in (".doc", ".docx"):
                text = extract_text_from_doc_bytes(file_bytes)
            elif ext in (".ppt", ".pptx"):
                text = extract_text_from_ppt_bytes(file_bytes)
            elif ext in IMAGE_EXTENSIONS:
                text = extract_text_from_image_bytes(file_bytes)
            else:
                text = "Unsupported file format."
        except Exception as e:
            # Best-effort per file: report the error, keep processing the rest.
            text = f"Error processing file: {str(e)}"
        extracted_texts[url] = text
    return extracted_texts


# Loaded once at import time so every request reuses the same model instance.
model = SentenceTransformer(
    "sentence-transformers/paraphrase-MiniLM-L6-v2",
    cache_folder="/tmp/huggingface_cache",
)


class AnswerRequest(BaseModel):
    # Free-text student answers to compare pairwise for similarity.
    answers: List[str]


@app.post("/detect-plagiarism")
async def detect_plagiarism(request: AnswerRequest):
    """Flag answers whose cosine similarity to any other answer is >= 0.90.

    Returns {"plagiarism_flags": [...]}, one 0/1 flag per input answer,
    in input order.
    """
    student_answers = request.answers
    n = len(student_answers)
    # Guard: with zero or one answer there is nothing to compare, and the
    # original would crash on an empty batch (embeddings.shape[1] on an
    # empty array).
    if n < 2:
        return {"plagiarism_flags": [0] * n}

    embeddings = np.asarray(
        model.encode(student_answers, convert_to_tensor=False)
    ).astype("float32")
    # L2-normalize so that inner product == cosine similarity.
    faiss.normalize_L2(embeddings)

    index = faiss.IndexFlatIP(embeddings.shape[1])
    index.add(embeddings)

    similarity_threshold = 0.90
    # Compare against up to 5 nearest neighbours (plus the self-match), but
    # never more vectors than the index holds. The original always searched
    # k+1=6, so with < 6 answers FAISS padded results with index -1 and a
    # -1 hit could write flags[-1] (the last student) spuriously.
    k = min(5, n - 1)

    flags = [0] * n
    for i in range(n):
        distances, indices = index.search(embeddings[i : i + 1], k + 1)
        for j, dist in zip(indices[0], distances[0]):
            # j == -1 marks a padded "no result" slot; j == i is the self-match.
            if j < 0 or j == i:
                continue
            if dist >= similarity_threshold:
                flags[i] = 1
                flags[j] = 1
    return {"plagiarism_flags": flags}