import streamlit as st
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch
import numpy as np
import fitz  # PyMuPDF
import pandas as pd
import io

# Hugging Face checkpoint used to score essays
model_name = "KevSun/Engessay_grading_ML"


@st.cache_resource
def _load_model_and_tokenizer(name):
    """Load the sequence-classification model and its tokenizer.

    Streamlit re-executes this script from the top on every widget
    interaction (button click, file upload, download).  Caching the loaded
    objects as a shared resource avoids repeating the expensive
    ``from_pretrained`` calls on each rerun.

    Args:
        name: Hugging Face model identifier to load.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    loaded_model = AutoModelForSequenceClassification.from_pretrained(name)
    loaded_tokenizer = AutoTokenizer.from_pretrained(name)
    return loaded_model, loaded_tokenizer


model, tokenizer = _load_model_and_tokenizer(model_name)

# Streamlit app: page header and instructions
st.title("Automated Scoring App")
# The essay is provided as a PDF upload (there is no text box), so the
# instructions must ask for a file rather than typed text.
st.write("Upload your English essay as a PDF below to predict scores from multiple dimensions:")

# Only PDF uploads are accepted; the value is None until a file is chosen
uploaded_file = st.file_uploader("Upload your PDF essay:", type=['pdf'])

if uploaded_file:
    # getvalue() returns the complete upload regardless of the current
    # stream position, unlike read(), which returns nothing once the buffer
    # has already been consumed.
    pdf_bytes = uploaded_file.getvalue()

    # Extract the text of every page and concatenate it in page order
    try:
        with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
            text_content = "".join(page.get_text() for page in doc)
    except (RuntimeError, ValueError) as exc:
        # NOTE(review): PyMuPDF is expected to report damaged/empty files via
        # RuntimeError subclasses and encrypted documents via ValueError --
        # confirm against the installed PyMuPDF version.
        st.error(f"Could not read the uploaded PDF: {exc}")
        # Halt this script run so nothing below works with a missing text
        st.stop()

    # Image-only (scanned) PDFs yield no text; tell the user up front
    if not text_content.strip():
        st.warning("No text could be extracted from this PDF. Scanned or image-only documents are not supported.")

    # Display the extracted text (read-only) so the user can verify it
    st.write("Extracted text from PDF:")
    st.text_area("PDF Content", text_content, height=200, disabled=True)

# Longest token sequence passed to the model (standard BERT/RoBERTa max length)
MAX_TOKENS = 512

if st.button("Predict"):
    if not uploaded_file:
        st.write("Please upload a PDF file to get scores.")
    elif not text_content.strip():
        # Scoring an empty string would produce meaningless numbers
        st.error("The uploaded PDF contains no extractable text, so it cannot be scored.")
    else:
        # Tokenize the text extracted from the PDF, truncating to the limit
        inputs = tokenizer(
            text_content,
            return_tensors="pt",
            truncation=True,
            max_length=MAX_TOKENS,
        )

        # Heuristic: a sequence that fills the whole window was most likely
        # truncated (an essay of exactly MAX_TOKENS tokens also triggers it).
        token_count = inputs["input_ids"].shape[1]
        if token_count >= MAX_TOKENS:
            st.warning("⚠️ The text was too long and has been truncated to fit the model's maximum length. This might affect the accuracy of the predictions.")

        # Inference only: no gradients needed
        with torch.no_grad():
            outputs = model(**inputs)

        # Drop only the batch axis so the per-dimension axis is always kept
        predicted_scores = outputs.logits.squeeze(0).numpy()

        # Linearly rescale the raw outputs, then snap to the nearest 0.5
        scaled_scores = 2.25 * predicted_scores - 1.25
        rounded_scores = [round(score * 2) / 2 for score in scaled_scores]

        # One row per scoring dimension, in the order the scores are listed
        labels = ["cohesion", "syntax", "vocabulary", "phraseology", "grammar", "conventions"]
        df = pd.DataFrame({
            'Dimension': labels,
            'Score': rounded_scores,
        })

        # Display results in app
        st.write("Scores:")
        st.dataframe(df)

        # Save CSV in the working directory of the process running the app
        local_path = "essay_scores.csv"
        df.to_csv(local_path, index=False)
        st.success(f"Results saved locally to {local_path}")

        # Offer the same table as a browser download
        csv_data = df.to_csv(index=False)
        st.download_button(
            label="Download results as CSV",
            data=csv_data,
            file_name="essay_scores.csv",
            mime="text/csv",
        )