import os
from pathlib import Path

# Configure cache directories before importing transformers/nltk, which read
# these environment variables at import time.
os.environ["TRANSFORMERS_CACHE"] = "/tmp/.cache/huggingface"
os.environ["HF_HOME"] = "/tmp/.cache/huggingface"
os.environ["NLTK_DATA"] = "/tmp/.cache/nltk"
Path("/tmp/.cache/huggingface").mkdir(parents=True, exist_ok=True)
Path("/tmp/.cache/nltk").mkdir(parents=True, exist_ok=True)

import re
import time
import logging
import asyncio
from typing import List, Tuple

from fastapi import FastAPI, UploadFile, File, HTTPException
from fastapi.middleware.cors import CORSMiddleware
import fitz  # PyMuPDF
import torch
import numpy as np
import nltk
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    AutoModel,
)
from nltk.tokenize import sent_tokenize
from sklearn.metrics.pairwise import cosine_similarity
# Logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# NLTK init: register the cache path once, then download punkt only if missing
nltk.data.path.append("/tmp/.cache/nltk")
try:
    nltk.data.find("tokenizers/punkt")
except LookupError:
    nltk.download("punkt", download_dir="/tmp/.cache/nltk")
# App init
app = FastAPI()
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["POST"],
    allow_headers=["*"],
)
# Model configs
MODEL_NAME = "Essay-Grader/roberta-ai-detector-20250401_232702"
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
MAX_TEXT_LENGTH = 10000  # max characters kept after extraction
AI_CHUNK_SIZE = 512  # token budget for the AI detector
PLAGIARISM_THRESHOLD = 0.75  # similarity above which plagiarism risk is flagged
TIMEOUT = 25  # seconds; total processing budget per request
# Load models once at startup
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
ai_model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME).to(DEVICE)
ai_model.eval()
embed_tokenizer = AutoTokenizer.from_pretrained(EMBEDDING_MODEL)
embed_model = AutoModel.from_pretrained(EMBEDDING_MODEL).to(DEVICE)
embed_model.eval()
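# Optional warm-up (a hedged sketch, not required by the API): one dummy
# forward pass at import time moves lazy CUDA/kernel initialization off the
# first request. Uncomment if cold-start latency matters.
# with torch.no_grad():
#     ai_model(**tokenizer("warm-up", return_tensors="pt").to(ai_model.device))
#     embed_model(**embed_tokenizer(["warm-up"], return_tensors="pt").to(embed_model.device))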
# Health check
@app.get("/health")
def health_check():
    return {"status": "healthy"}
def extract_text(pdf_bytes: bytes) -> str:
    try:
        with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
            text = []
            for page in doc:
                page_text = page.get_text().strip()
                # Heuristic: stop at the first page mentioning "reference"
                # to exclude the reference section from analysis.
                if "reference" in page_text.lower():
                    break
                text.append(page_text)
        full_text = re.sub(r"\s+", " ", "\n".join(text))[:MAX_TEXT_LENGTH]
        if len(full_text) < 150:
            raise ValueError("Extracted text too short (minimum 150 characters)")
        return full_text
    except Exception as e:
        logger.error(f"PDF error: {e}")
        raise HTTPException(400, f"Invalid PDF: {e}")
def predict_ai(text: str) -> float:
    """Score the first AI_CHUNK_SIZE tokens; returns P(AI-generated)."""
    inputs = tokenizer(
        text,
        truncation=True,
        max_length=AI_CHUNK_SIZE,
        return_tensors="pt",
    ).to(ai_model.device)
    with torch.no_grad():
        outputs = ai_model(**inputs)
        probs = torch.softmax(outputs.logits, dim=1)
    return float(probs[0][1])  # probability of the AI-generated class
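# Note: predict_ai scores only the first AI_CHUNK_SIZE tokens. Below is a
# minimal sketch of whole-document scoring (predict_ai_chunked is a
# hypothetical helper, not part of the original pipeline): split the token
# ids into fixed windows, re-decode, score each window, and average.
def predict_ai_chunked(text: str) -> float:
    ids = tokenizer(text, add_special_tokens=False)["input_ids"]
    window = AI_CHUNK_SIZE - 2  # leave room for special tokens when re-encoding
    chunks = [
        tokenizer.decode(ids[i : i + window]) for i in range(0, len(ids), window)
    ]
    scores = [predict_ai(chunk) for chunk in chunks]
    return sum(scores) / len(scores) if scores else 0.0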
def compute_embeddings(sentences: List[str]) -> torch.Tensor:
    """Mean-pooled sentence embeddings (masked average over token states)."""
    inputs = embed_tokenizer(
        sentences,
        padding=True,
        truncation=True,
        max_length=128,
        return_tensors="pt",
    ).to(embed_model.device)
    with torch.no_grad():
        outputs = embed_model(**inputs)
    attention_mask = inputs["attention_mask"]
    last_hidden = outputs.last_hidden_state
    # Zero out padding tokens, then average over the sequence dimension.
    summed = (last_hidden * attention_mask.unsqueeze(-1)).sum(1)
    return summed / attention_mask.sum(1, keepdim=True)
def check_plagiarism(text: str) -> Tuple[float, bool]:
    """Estimate internal redundancy from pairwise sentence similarity."""
    try:
        # Keep mid-length sentences only, capped at 40 for speed.
        sentences = [s for s in sent_tokenize(text) if 5 < len(s.split()) < 100][:40]
        if len(sentences) < 2:
            return 0.0, False
        embeddings = compute_embeddings(sentences).cpu().numpy()
        sim_matrix = cosine_similarity(embeddings)
        np.fill_diagonal(sim_matrix, 0)
        # Average the top 10% most similar sentence pairs.
        n = len(sim_matrix)
        top_k = max(1, int(0.1 * n * (n - 1) / 2))
        top_indices = np.argpartition(sim_matrix.flatten(), -top_k)[-top_k:]
        avg_similarity = float(np.mean(sim_matrix.flatten()[top_indices]))
        return round(avg_similarity * 100, 2), avg_similarity > PLAGIARISM_THRESHOLD
    except Exception as e:
        logger.error(f"Plagiarism check error: {e}")
        return 0.0, False
@app.post("/detect")
async def detect_ai_and_plagiarism(file: UploadFile = File(...)):
    start_time = time.time()
    try:
        if not file.filename.lower().endswith(".pdf"):
            raise HTTPException(400, "Only PDF files allowed")
        pdf_data = await file.read()
        text = extract_text(pdf_data)
        # Run both model passes in worker threads so they execute
        # concurrently without blocking the event loop.
        ai_score, (plag_score, plag_risk) = await asyncio.gather(
            asyncio.to_thread(predict_ai, text),
            asyncio.to_thread(check_plagiarism, text),
        )
        total_time = time.time() - start_time
        if total_time > TIMEOUT:
            raise HTTPException(500, "Processing timed out")
        return {
            "ai_generated_percentage": round(ai_score * 100, 2),
            "plagiarism_percentage": plag_score,
            "plagiarism_risk": plag_risk,
        }
    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Error: {e}", exc_info=True)
        raise HTTPException(500, f"Processing failed: {e}")
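# A minimal sketch for local testing (assumes uvicorn is installed and this
# module is named main.py; "paper.pdf" is a placeholder file):
#
#     uvicorn main:app --host 0.0.0.0 --port 8000
#     curl -X POST -F "file=@paper.pdf" http://localhost:8000/detect
if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8000)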