File size: 14,001 Bytes
0da1e60
 
 
 
 
26199ad
926b2e7
 
 
 
 
a964db1
 
 
 
926b2e7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a964db1
26199ad
0da1e60
 
 
 
 
926b2e7
0da1e60
926b2e7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0da1e60
 
 
 
 
 
926b2e7
 
 
 
 
0da1e60
926b2e7
0da1e60
81b538b
 
 
 
 
 
 
 
 
 
926b2e7
81b538b
 
 
926b2e7
 
 
 
 
 
 
0da1e60
926b2e7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0da1e60
926b2e7
 
 
 
0da1e60
926b2e7
0da1e60
926b2e7
0da1e60
926b2e7
0da1e60
926b2e7
0da1e60
926b2e7
0da1e60
926b2e7
0da1e60
926b2e7
0da1e60
926b2e7
 
 
0da1e60
926b2e7
0da1e60
926b2e7
0da1e60
926b2e7
0da1e60
926b2e7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
import os
import threading
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime
from queue import Queue

import pandas as pd
import streamlit as st
import torch
from peft import LoraConfig, PeftModel
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer


class Summarizer:
    """Summarize research-paper abstracts with BioBART.

    The base ``GanjinZero/biobart-base`` model produces per-paper summaries;
    a LoRA fine-tuned variant (loaded lazily from the current directory) is
    used only for the final combined summary. Worker threads never call
    Streamlit widgets directly — they push status messages onto
    ``update_queue``, which a dedicated UI-updater thread drains.
    """

    def __init__(self):
        """Load the base model and tokenizer and create the progress queue.

        Raises:
            Exception: re-raised (after showing a Streamlit error) when
                model loading fails.
        """
        try:
            with st.spinner("🤖 Initializing AI models..."):
                # Enable GPU if available
                self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
                st.info(f"Using device: {self.device}")

                # Load base model and tokenizer once. The fine-tuned adapter
                # is only needed for the final summary, so it is loaded
                # lazily in summarize_text().
                self.base_model = AutoModelForSeq2SeqLM.from_pretrained("GanjinZero/biobart-base").to(self.device)
                self.tokenizer = AutoTokenizer.from_pretrained("GanjinZero/biobart-base")
                self.finetuned_model = None

                # Queue for UI updates: Streamlit is not thread-safe, so
                # workers report progress here instead of calling st.* directly.
                self.update_queue = Queue()

                st.success("✅ Models ready!")

        except Exception as e:
            st.error(f"Error loading models: {str(e)}")
            raise

    def summarize_text(self, text, max_length=150, use_finetuned=False):
        """Generate an abstractive summary of *text*.

        Args:
            text: Input text (tokenized with truncation to 512 tokens).
            max_length: Maximum length of the generated summary in tokens.
            use_finetuned: When True, use (and lazily load) the LoRA
                fine-tuned model instead of the base model.

        Returns:
            The decoded summary string, or ``"Error generating summary"``
            when generation fails.
        """
        try:
            inputs = self.tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
            inputs = {k: v.to(self.device) for k, v in inputs.items()}

            with torch.no_grad():  # Memory optimization: inference needs no gradients
                if use_finetuned:
                    if self.finetuned_model is None:
                        with st.spinner("Loading fine-tuned model for final summary..."):
                            base_model_for_finetuned = AutoModelForSeq2SeqLM.from_pretrained(
                                "GanjinZero/biobart-base"
                            ).to(self.device)
                            lora_config = LoraConfig(
                                r=8,
                                lora_alpha=16,
                                target_modules=["q_proj", "v_proj"],
                                inference_mode=True
                            )
                            # NOTE(review): adapter weights are expected in the
                            # current working directory (".") — confirm this
                            # matches the deployment layout.
                            self.finetuned_model = PeftModel.from_pretrained(
                                base_model_for_finetuned,
                                ".",
                                config=lora_config,
                                is_trainable=False
                            )
                            self.finetuned_model.eval()

                    summary_ids = self.finetuned_model.generate(
                        **inputs,
                        max_length=max_length,
                        num_beams=4,
                        length_penalty=2.0,
                        early_stopping=True
                    )
                else:
                    # Bug fix: pass the attention mask too. The base path
                    # previously sent only input_ids, silently dropping the
                    # mask the tokenizer produced.
                    summary_ids = self.base_model.generate(
                        **inputs,
                        max_length=max_length,
                        num_beams=4,
                        length_penalty=2.0,
                        early_stopping=True
                    )

            return self.tokenizer.decode(summary_ids[0], skip_special_tokens=True)
        except Exception as e:
            st.error(f"Error in summarization: {str(e)}")
            return "Error generating summary"

    def process_paper(self, row):
        """Process a single paper row - used by ThreadPoolExecutor.

        Returns a dict of paper metadata plus its generated summary, or
        ``None`` when the row has no abstract.
        """
        if pd.notna(row['Abstract']):
            # Put update in queue instead of directly updating the UI
            # (Streamlit calls must not happen on worker threads).
            self.update_queue.put(f"Processing: {row['Article Title'][:100]}...")

            summary = self.summarize_text(row['Abstract'], use_finetuned=False)

            # Missing metadata fields collapse to '' (or 0 for citations) so
            # downstream display code never sees NaN.
            return {
                'title': row['Article Title'],
                'authors': row['Authors'] if pd.notna(row['Authors']) else '',
                'source': row['Source Title'] if pd.notna(row['Source Title']) else '',
                'year': row['Publication Year'] if pd.notna(row['Publication Year']) else '',
                'doi': row['DOI'] if pd.notna(row['DOI']) else '',
                'document_type': row['Document Type'] if pd.notna(row['Document Type']) else '',
                'times_cited': row['Times Cited, WoS Core'] if pd.notna(row['Times Cited, WoS Core']) else 0,
                'open_access': row['Open Access Designations'] if pd.notna(row['Open Access Designations']) else '',
                'research_areas': row['Research Areas'] if pd.notna(row['Research Areas']) else '',
                'summary': summary
            }
        return None

    def ui_updater(self, status_placeholder):
        """Separate thread: drain the queue into the UI until "DONE" arrives."""
        while True:
            message = self.update_queue.get()
            if message == "DONE":
                break
            status_placeholder.info(message)

    def process_excel(self, file):
        """Summarize every paper in an uploaded Excel file.

        Args:
            file: File-like object accepted by ``pd.read_excel``.

        Returns:
            ``(summaries, overall_summary)``: a list of per-paper dicts and a
            combined summary string. On failure returns
            ``([], "Error generating summary")``.
        """
        try:
            start_time = time.time()

            # Create containers for different sections
            stats_container = st.container()
            progress_container = st.container()
            preview_container = st.container()
            status_placeholder = st.empty()

            with stats_container:
                col1, col2, col3 = st.columns(3)

            # Only load required columns - Memory optimization
            required_columns = [
                'Article Title', 'Authors', 'Abstract', 'Source Title',
                'Publication Year', 'Times Cited, WoS Core', 'DOI',
                'Document Type', 'Research Areas', 'Open Access Designations'
            ]
            df = pd.read_excel(file, usecols=required_columns)
            total_papers = len(df)

            # Start UI updater thread. daemon=True is a safety net so an
            # unexpected crash cannot leave the process hanging on it.
            ui_thread = threading.Thread(
                target=self.ui_updater, args=(status_placeholder,), daemon=True
            )
            ui_thread.start()

            summaries = []
            completed_papers = 0

            try:
                # Process papers in parallel. Bug fix: iterate with
                # as_completed() so results are handled as they finish —
                # the old code walked the futures dict in submission order,
                # stalling progress updates behind slow papers.
                with ThreadPoolExecutor(max_workers=4) as executor:
                    futures = [executor.submit(self.process_paper, row)
                               for _, row in df.iterrows()]

                    for future in as_completed(futures):
                        result = future.result()
                        if result:
                            summaries.append(result)
                            completed_papers += 1

                            # Update running statistics
                            elapsed_time = time.time() - start_time
                            avg_time_per_paper = elapsed_time / completed_papers
                            estimated_time_remaining = avg_time_per_paper * (total_papers - completed_papers)

                            with stats_container:
                                col1.metric("Papers Processed", f"{completed_papers}/{total_papers}")
                                col2.metric("Avg Time per Paper", f"{avg_time_per_paper:.1f}s")
                                col3.metric("Est. Time Remaining", f"{estimated_time_remaining/60:.1f}min")

                            # Show latest processed paper in preview
                            with preview_container:
                                if completed_papers == 1:
                                    st.markdown("### 📋 Latest Processed Papers")
                                st.success(f"✅ Processed: {result['title'][:100]}...")
            finally:
                # Bug fix: always stop the UI-updater thread, even when a
                # worker raised — otherwise it blocks on the queue forever.
                self.update_queue.put("DONE")
                ui_thread.join()

            # Generate overall summary from the concatenated per-paper summaries
            with st.spinner("🤖 Generating final summary..."):
                combined_summaries = " ".join([s['summary'] for s in summaries])
                overall_summary = self.summarize_text(combined_summaries, max_length=250, use_finetuned=True)

            return summaries, overall_summary

        except Exception as e:
            st.error(f"Error processing Excel file: {str(e)}")
            return [], "Error generating summary"

def display_sorted_summaries(summaries, sort_by):
    """Render each paper summary in an expander, ordered by *sort_by*.

    Unknown sort options leave the original order untouched; the input list
    is never mutated.
    """
    # Dispatch table: sort option -> (key function, descending?).
    sort_rules = {
        "Year": (lambda p: str(p['year']), True),
        "Citations": (lambda p: p['times_cited'], True),
        "Source": (lambda p: str(p['source']), False),
        "Type": (lambda p: str(p['document_type']), False),
        "Access": (lambda p: str(p['open_access']), False),
        "Research Areas": (lambda p: str(p['research_areas']), False),
    }

    ordered = list(summaries)
    if sort_by in sort_rules:
        key_fn, descending = sort_rules[sort_by]
        ordered.sort(key=key_fn, reverse=descending)

    # One expander per paper: summary on the left, metadata on the right.
    for paper in ordered:
        with st.expander(f"📄 {paper['title']} ({paper['year']})"):
            summary_col, details_col = st.columns([2, 1])

            with summary_col:
                st.markdown("**Summary:**")
                st.write(paper['summary'])

            with details_col:
                st.markdown("**📝 Details:**")
                detail_fields = (
                    ("👥 Authors", 'authors'),
                    ("📰 Source", 'source'),
                    ("🔗 DOI", 'doi'),
                    ("📄 Type", 'document_type'),
                    ("📊 Citations", 'times_cited'),
                    ("🔓 Access", 'open_access'),
                    ("🔬 Areas", 'research_areas'),
                )
                for label, field in detail_fields:
                    value = paper[field]
                    if value:  # Only display non-empty values
                        st.write(f"**{label}:** {value}")

def main():
    """Streamlit entry point: sidebar inputs, processing trigger, results display.

    State (the Summarizer instance, per-paper summaries, and the overall
    summary) lives in ``st.session_state`` so results survive reruns.
    """
    st.set_page_config(page_title="Research Paper Summarizer", layout="wide")

    st.title("📚 Research Paper Summarizer")
    st.markdown("""
    Upload an Excel file containing research papers and get AI-generated summaries for each paper,
    plus an overall synthesis of the research.
    """)

    # Initialize session state
    if 'summarizer' not in st.session_state:
        st.session_state['summarizer'] = None
    if 'summaries' not in st.session_state:
        st.session_state['summaries'] = []
    if 'overall_summary' not in st.session_state:
        st.session_state['overall_summary'] = None

    # Sidebar inputs
    with st.sidebar:
        st.header("⚙️ Input Options")

        uploaded_file = st.file_uploader(
            "Choose an Excel file",
            type=['xlsx', 'xls'],
            help="Upload an Excel file containing research paper details including abstracts"
        )

        # NOTE(review): `question` only gates the Generate button below — it
        # is never passed into processing. Confirm whether it should steer
        # the summaries.
        question = st.text_area(
            "Enter your research question",
            help="Enter the main research question you're investigating"
        )

        if uploaded_file:
            st.info(f"📁 File uploaded: {uploaded_file.name}")

        generate_button = st.button(
            "🚀 Generate Summaries",
            type="primary",
            use_container_width=True,
            disabled=not (uploaded_file and question)
        )

        if not uploaded_file or not question:
            st.warning("⚠️ Please upload a file and enter a research question to proceed.")

    # Main processing
    try:
        if generate_button and uploaded_file and question:
            # Lazily construct the Summarizer (model loading is expensive)
            # and keep it across reruns.
            if st.session_state['summarizer'] is None:
                st.session_state['summarizer'] = Summarizer()

            summaries, overall_summary = st.session_state['summarizer'].process_excel(uploaded_file)
            st.session_state['summaries'] = summaries
            st.session_state['overall_summary'] = overall_summary

            # Save results for download
            if summaries:
                summary_df = pd.DataFrame(summaries)
                st.session_state['summary_df'] = summary_df

        # Display results if available
        if st.session_state['overall_summary']:
            st.header("📊 Overall Summary")
            st.write(st.session_state['overall_summary'])

            # Add download button for overall summary
            st.download_button(
                "📥 Download Overall Summary",
                st.session_state['overall_summary'],
                "overall_summary.txt",
                "text/plain"
            )

        if st.session_state['summaries']:
            st.header("📑 Individual Paper Summaries")

            # Sorting options (second column was unused — bound to `_`)
            sort_col, _ = st.columns([2, 3])
            with sort_col:
                sort_by = st.selectbox(
                    "🔄 Sort by",
                    ["Year", "Citations", "Source", "Type", "Access", "Research Areas"]
                )

            # Display sorted summaries
            display_sorted_summaries(st.session_state['summaries'], sort_by)

            # Add download button for detailed summaries
            if 'summary_df' in st.session_state:
                st.download_button(
                    "📥 Download Detailed Summaries",
                    st.session_state['summary_df'].to_csv(index=False),
                    "detailed_summaries.csv",
                    "text/csv"
                )

    except Exception as e:
        st.error(f"An error occurred: {str(e)}")
        st.error("Please try again or contact support if the problem persists.")

if __name__ == "__main__":
    # Entry point when executed directly (e.g. via `streamlit run`).
    main()