import io
import os
from typing import List

# Point the Hugging Face cache at a writable location (e.g. serverless hosts
# where only /tmp is writable). Must be set BEFORE any HF library reads it,
# so it goes ahead of the sentence_transformers import (the original set it
# after all imports).
os.environ["HF_HOME"] = "/tmp/huggingface_cache"

import faiss
import fitz  # PyMuPDF
import numpy as np
import pytesseract
import requests
from docx import Document
from fastapi import FastAPI
from PIL import Image
from pptx import Presentation
from pydantic import BaseModel
from sentence_transformers import SentenceTransformer

app = FastAPI()

# File extensions routed to the OCR (image) extraction path.
IMAGE_EXTENSIONS = {
    ".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".tif", ".webp",
    ".pbm", ".pgm", ".ppm", ".gif", ".jp2", ".pcx", ".pnm",
}


def clean_text(text: str) -> str:
    """Collapse all whitespace runs (newlines, tabs, multiple spaces) into single spaces."""
    return " ".join(text.split())


def extract_text_from_pdf_bytes(file_bytes: bytes) -> str:
    """Extract and normalize text from a PDF supplied as raw bytes."""
    # Context manager releases the MuPDF document handle; the original left
    # it open, leaking a native resource per request.
    with fitz.open(stream=file_bytes, filetype="pdf") as doc:
        text = " ".join(page.get_text() for page in doc)
    return clean_text(text)


def extract_text_from_doc_bytes(file_bytes: bytes) -> str:
    """Extract paragraph text from a .docx file supplied as raw bytes.

    NOTE: python-docx only parses OOXML (.docx); legacy binary .doc files
    raise and are reported by the caller's error handler.
    """
    doc = Document(io.BytesIO(file_bytes))
    return clean_text(" ".join(para.text for para in doc.paragraphs))


def extract_text_from_ppt_bytes(file_bytes: bytes) -> str:
    """Extract text from every text-bearing shape of a .pptx supplied as bytes.

    NOTE: python-pptx only parses .pptx; legacy .ppt files raise and are
    reported by the caller's error handler.
    """
    prs = Presentation(io.BytesIO(file_bytes))
    pieces = (
        shape.text
        for slide in prs.slides
        for shape in slide.shapes
        if hasattr(shape, "text")
    )
    return clean_text(" ".join(pieces))


def extract_text_from_image_bytes(file_bytes: bytes) -> str:
    """OCR an image supplied as raw bytes using the system Tesseract binary."""
    pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"
    image = Image.open(io.BytesIO(file_bytes))
    return clean_text(pytesseract.image_to_string(image))


def download_file(url: str, timeout: float = 30.0) -> bytes:
    """Download *url* and return the raw response body.

    Raises requests.HTTPError on non-2xx responses. A timeout is applied —
    the original passed none, so a stalled server would hang the request
    forever. The new parameter has a default, so existing callers are
    unaffected.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return response.content


@app.get("/")
def home():
    """Liveness check endpoint."""
    return {"msg": "hello ml server is deployed"}


@app.post("/extract-text")
async def extract_text(file_urls: List[str]):
    """Download each URL and return a {url: extracted_text} mapping.

    Per-file failures are reported as an error string for that URL rather
    than aborting the whole batch.
    """
    extracted_texts = {}
    for url in file_urls:
        try:
            file_bytes = download_file(url)
            # Strip any query string before taking the file extension.
            ext = os.path.splitext(url.split("?")[0])[1].lower()
            if ext == ".pdf":
                text = extract_text_from_pdf_bytes(file_bytes)
            elif ext in (".doc", ".docx"):
                text = extract_text_from_doc_bytes(file_bytes)
            elif ext in (".ppt", ".pptx"):
                text = extract_text_from_ppt_bytes(file_bytes)
            elif ext in IMAGE_EXTENSIONS:
                text = extract_text_from_image_bytes(file_bytes)
            else:
                text = "Unsupported file format."
        except Exception as e:
            # Best-effort per file: report the error, keep processing the rest.
            text = f"Error processing file: {str(e)}"
        extracted_texts[url] = text
    return extracted_texts


# Loaded once at import time so every request reuses the same model instance.
model = SentenceTransformer(
    "sentence-transformers/paraphrase-MiniLM-L6-v2",
    cache_folder="/tmp/huggingface_cache",
)


class AnswerRequest(BaseModel):
    # Free-text student answers to compare pairwise for similarity.
    answers: List[str]


@app.post("/detect-plagiarism")
async def detect_plagiarism(request: AnswerRequest):
    """Flag answers whose cosine similarity to any other answer is >= 0.90.

    Returns {"plagiarism_flags": [...]}, one 0/1 flag per input answer,
    in input order.
    """
    student_answers = request.answers
    n = len(student_answers)
    # Guard: with zero or one answer there is nothing to compare, and the
    # original would crash on an empty batch (embeddings.shape[1] on an
    # empty array).
    if n < 2:
        return {"plagiarism_flags": [0] * n}

    embeddings = np.asarray(
        model.encode(student_answers, convert_to_tensor=False)
    ).astype("float32")
    # L2-normalize so that inner product == cosine similarity.
    faiss.normalize_L2(embeddings)

    index = faiss.IndexFlatIP(embeddings.shape[1])
    index.add(embeddings)

    similarity_threshold = 0.90
    # Compare against up to 5 nearest neighbours (plus the self-match), but
    # never more vectors than the index holds. The original always searched
    # k+1=6, so with < 6 answers FAISS padded results with index -1 and a
    # -1 hit could write flags[-1] (the last student) spuriously.
    k = min(5, n - 1)

    flags = [0] * n
    for i in range(n):
        distances, indices = index.search(embeddings[i : i + 1], k + 1)
        for j, dist in zip(indices[0], distances[0]):
            # j == -1 marks a padded "no result" slot; j == i is the self-match.
            if j < 0 or j == i:
                continue
            if dist >= similarity_threshold:
                flags[i] = 1
                flags[j] = 1
    return {"plagiarism_flags": flags}