# main.py: AI detection and plagiarism check API
import os
from pathlib import Path

# Configure cache directories BEFORE importing transformers/nltk,
# otherwise the env vars are read too late to take effect.
os.environ["TRANSFORMERS_CACHE"] = "/tmp/.cache/huggingface"
os.environ["HF_HOME"] = "/tmp/.cache/huggingface"
os.environ["NLTK_DATA"] = "/tmp/.cache/nltk"
Path("/tmp/.cache/huggingface").mkdir(parents=True, exist_ok=True)
Path("/tmp/.cache/nltk").mkdir(parents=True, exist_ok=True)

import asyncio
import logging
import re
import time
from typing import List, Tuple

import fitz  # PyMuPDF
import nltk
import numpy as np
import torch
from fastapi import FastAPI, File, HTTPException, UploadFile
from fastapi.middleware.cors import CORSMiddleware
from nltk.tokenize import sent_tokenize
from sklearn.metrics.pairwise import cosine_similarity
from transformers import (
    AutoModel,
    AutoModelForSequenceClassification,
    AutoTokenizer,
)

# Logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# NLTK init: reuse a cached punkt tokenizer, downloading it on first run
nltk.data.path.append("/tmp/.cache/nltk")
try:
    nltk.data.find("tokenizers/punkt")
except LookupError:
    nltk.download("punkt", download_dir="/tmp/.cache/nltk")

# App init
app = FastAPI()
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["POST"],
    allow_headers=["*"],
)

# Model configs
MODEL_NAME = "Essay-Grader/roberta-ai-detector-20250401_232702"
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
MIN_TEXT_LENGTH = 150
MAX_TEXT_LENGTH = 10000
AI_CHUNK_SIZE = 512
PLAGIARISM_THRESHOLD = 0.75
TIMEOUT = 25  # total processing budget in seconds

# Load models at startup
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
ai_model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME).to(DEVICE)
ai_model.eval()

embed_tokenizer = AutoTokenizer.from_pretrained(EMBEDDING_MODEL)
embed_model = AutoModel.from_pretrained(EMBEDDING_MODEL).to(DEVICE)
embed_model.eval()

# Health check (disabled)
# @app.get("/health")
# def health_check():
#     return {"status": "healthy"}


def extract_text(pdf_bytes: bytes) -> str:
    """Extract text from a PDF, excluding the reference section onward."""
    try:
        with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
            text = []
            for page in doc:
                page_text = page.get_text().strip()
                if "reference" in page_text.lower():
                    break  # exclude the reference section and everything after it
                text.append(page_text)
        full_text = re.sub(r"\s+", " ", "\n".join(text))[:MAX_TEXT_LENGTH]
        if len(full_text) < MIN_TEXT_LENGTH:
            raise ValueError("Text too short")
        return full_text
    except Exception as e:
        logger.error(f"PDF error: {str(e)}")
        raise HTTPException(400, "Invalid PDF")


def predict_ai(text: str) -> float:
    """Return the probability that the text is AI-generated."""
    inputs = tokenizer(
        text,
        truncation=True,
        max_length=AI_CHUNK_SIZE,
        return_tensors="pt",
    ).to(ai_model.device)
    with torch.no_grad():
        outputs = ai_model(**inputs)
    probs = torch.softmax(outputs.logits, dim=1)
    return float(probs[0][1])  # probability of the AI-generated class (label index 1)


def compute_embeddings(sentences: List[str]) -> torch.Tensor:
    """Mean-pooled sentence embeddings, masking out padding tokens."""
    inputs = embed_tokenizer(
        sentences,
        padding=True,
        truncation=True,
        max_length=128,
        return_tensors="pt",
    ).to(embed_model.device)
    with torch.no_grad():
        outputs = embed_model(**inputs)
    attention_mask = inputs["attention_mask"].unsqueeze(-1).float()
    last_hidden = outputs.last_hidden_state
    return (last_hidden * attention_mask).sum(1) / attention_mask.sum(1).clamp(min=1e-9)
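
# Illustrative shapes (comment-only sketch; the 384-dim output assumes
# all-MiniLM-L6-v2, and the example sentences are arbitrary):
#   vecs = compute_embeddings(["First sentence.", "A second one."])
#   vecs.shape  # torch.Size([2, 384])
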
def check_plagiarism(text: str) -> Tuple[float, bool]:
    """Flag internal repetition: average the top 10% most similar sentence
    pairs and compare against PLAGIARISM_THRESHOLD."""
    try:
        # Keep sentences of reasonable length, capped at 40 for speed
        sentences = [s for s in sent_tokenize(text) if 5 < len(s.split()) < 100][:40]
        if len(sentences) < 2:
            return 0.0, False
        embeddings = compute_embeddings(sentences).cpu().numpy()
        sim_matrix = cosine_similarity(embeddings)
        # Use only the upper triangle so each pair is counted exactly once
        pair_sims = sim_matrix[np.triu_indices(len(sentences), k=1)]
        top_k = max(1, int(0.1 * len(pair_sims)))
        top_indices = np.argpartition(pair_sims, -top_k)[-top_k:]
        avg_similarity = float(np.mean(pair_sims[top_indices]))
        return round(avg_similarity * 100, 2), avg_similarity > PLAGIARISM_THRESHOLD
    except Exception as e:
        logger.error(f"Plagiarism check error: {str(e)}")
        return 0.0, False


@app.post("/detect")
async def detect_ai_and_plagiarism(file: UploadFile = File(...)):
    start_time = time.time()
    try:
        if not file.filename.lower().endswith(".pdf"):
            raise HTTPException(400, "Only PDF files allowed")

        pdf_data = await file.read()
        text = extract_text(pdf_data)

        # Both checks are CPU/GPU-bound, so run them in worker threads;
        # plain `async def` wrappers would still block the event loop and
        # run sequentially. Enforce the time budget up front rather than
        # raising after the work has already finished.
        ai_score, (plag_score, plag_risk) = await asyncio.wait_for(
            asyncio.gather(
                asyncio.to_thread(predict_ai, text),
                asyncio.to_thread(check_plagiarism, text),
            ),
            timeout=TIMEOUT,
        )

        logger.info(f"Processed {file.filename} in {time.time() - start_time:.2f}s")
        return {
            "ai_generated_percentage": round(ai_score * 100, 2),
            "plagiarism_percentage": plag_score,
            # "plagiarism_risk": plag_risk,
        }
    except asyncio.TimeoutError:
        raise HTTPException(500, "Processing timed out")
    except HTTPException:
        raise  # preserve 400-level errors instead of converting them to 500s
    except Exception as e:
        logger.error(f"Error: {str(e)}", exc_info=True)
        raise HTTPException(500, f"Processing failed: {str(e)}")
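
# Quick manual test, assuming this file is named main.py and uvicorn is
# installed (both assumptions; adjust to your setup):
#   uvicorn main:app --port 8000
#   curl -X POST -F "file=@essay.pdf" http://localhost:8000/detect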
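
# Minimal local entry point, assuming uvicorn is available; hosting
# platforms that launch the ASGI app themselves will not execute this.
if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8000)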