Spaces:

pendar02
/

summarizer

Sleeping

App Files Files Community

pendar02 committed 14 days ago

Commit

0da1e60

•

1 Parent(s): f791a84

Update app.py

Browse files

Files changed (1) hide show

app.py +167 -47

app.py CHANGED Viewed

@@ -1,73 +1,193 @@
-# At the top of app.py, add debug printing
 import os
-import logging
-# Configure logging
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger(__name__)
 class Summarizer:
     def __init__(self):
         try:
-            # Print current directory contents for debugging
-            st.write("Current directory contents:")
-            st.write(os.listdir('.'))
-            # Base model
-            self.base_model = AutoModelForSeq2SeqLM.from_pretrained(
-                "GanjinZero/biobart-base",
-                local_files_only=False  # Allow downloading base model
-            )
-            self.tokenizer = AutoTokenizer.from_pretrained(
-                "GanjinZero/biobart-base",
-                local_files_only=False
-            )
-            # Load adapter config from local file
-            adapter_config_path = "./adapter_config.json"
-            if not os.path.exists(adapter_config_path):
-                st.error(f"adapter_config.json not found in {os.getcwd()}")
-                raise FileNotFoundError("adapter_config.json not found")
-            st.write(f"Loading adapter config from {adapter_config_path}")
-            # Create LoRA config
             lora_config = LoraConfig(
                 r=8,
                 lora_alpha=16,
-                lora_dropout=0.1,
-                bias="none",
-                task_type="SEQ_2_SEQ_LM",
                 target_modules=["q_proj", "v_proj"],
                 inference_mode=True
             )
-            # Load base model for fine-tuning
-            base_model_for_finetuned = AutoModelForSeq2SeqLM.from_pretrained(
-                "GanjinZero/biobart-base",
-                local_files_only=False
-            )
-            st.write("Loading fine-tuned model...")
-            # Try to load the PEFT model from the current directory
             self.finetuned_model = PeftModel.from_pretrained(
                 base_model_for_finetuned,
-                ".",  # Current directory
                 config=lora_config,
-                torch_dtype=torch.float32,
-                is_trainable=False,
-                local_files_only=True
             )
             self.finetuned_model.eval()
             st.success("Models loaded successfully!")
         except Exception as e:
             st.error(f"Error loading models: {str(e)}")
             st.write("Debug info:")
-            st.write(f"Current working directory: {os.getcwd()}")
-            st.write(f"Directory contents: {os.listdir('.')}")
-            if os.path.exists('adapter_config.json'):
-                with open('adapter_config.json', 'r') as f:
-                    st.write("adapter_config.json contents:", f.read())
-            raise

+import streamlit as st
+import pandas as pd
+from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
+from peft import PeftModel, LoraConfig
+import torch
 import os
+# Configure the Streamlit page
+st.set_page_config(page_title="Research Paper Summarizer", layout="wide")
 class Summarizer:
     def __init__(self):
         try:
+            st.info("Loading models... Please wait.")
+            # Load base model and tokenizer
+            self.base_model = AutoModelForSeq2SeqLM.from_pretrained("GanjinZero/biobart-base")
+            self.tokenizer = AutoTokenizer.from_pretrained("GanjinZero/biobart-base")
+            # Debug info
+            st.write("Current directory contents:", os.listdir())
+            # Load fine-tuned model
+            base_model_for_finetuned = AutoModelForSeq2SeqLM.from_pretrained("GanjinZero/biobart-base")
+            # Configure LoRA
             lora_config = LoraConfig(
                 r=8,
                 lora_alpha=16,
                 target_modules=["q_proj", "v_proj"],
                 inference_mode=True
             )
+            # Load PEFT model
             self.finetuned_model = PeftModel.from_pretrained(
                 base_model_for_finetuned,
+                ".",
                 config=lora_config,
+                is_trainable=False
             )
             self.finetuned_model.eval()
             st.success("Models loaded successfully!")
         except Exception as e:
             st.error(f"Error loading models: {str(e)}")
             st.write("Debug info:")
+            st.write(f"Working directory: {os.getcwd()}")
+            st.write(f"Files available: {os.listdir()}")
+            raise
+    def summarize_text(self, text, max_length=150, use_finetuned=False):
+        try:
+            inputs = self.tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
+            if use_finetuned:
+                summary_ids = self.finetuned_model.generate(
+                    **inputs,
+                    max_length=max_length,
+                    num_beams=4,
+                    length_penalty=2.0,
+                    early_stopping=True
+                )
+            else:
+                summary_ids = self.base_model.generate(
+                    inputs["input_ids"],
+                    max_length=max_length,
+                    num_beams=4,
+                    length_penalty=2.0,
+                    early_stopping=True
+                )
+            return self.tokenizer.decode(summary_ids[0], skip_special_tokens=True)
+        except Exception as e:
+            st.error(f"Error in summarization: {str(e)}")
+            return "Error generating summary"
+    def process_excel(self, file, question):
+        try:
+            df = pd.read_excel(file)
+            summaries = []
+            progress_bar = st.progress(0)
+            total_rows = len(df)
+            for idx, row in df.iterrows():
+                if pd.notna(row['Abstract']):
+                    progress_bar.progress((idx + 1) / total_rows)
+                    paper_info = {
+                        'title': row['Article Title'],
+                        'authors': row['Authors'] if pd.notna(row['Authors']) else '',
+                        'source': row['Source Title'] if pd.notna(row['Source Title']) else '',
+                        'year': row['Publication Year'] if pd.notna(row['Publication Year']) else '',
+                        'doi': row['DOI'] if pd.notna(row['DOI']) else '',
+                        'document_type': row['Document Type'] if pd.notna(row['Document Type']) else '',
+                        'times_cited': row['Times Cited, WoS Core'] if pd.notna(row['Times Cited, WoS Core']) else 0,
+                        'open_access': row['Open Access Designations'] if pd.notna(row['Open Access Designations']) else '',
+                        'research_areas': row['Research Areas'] if pd.notna(row['Research Areas']) else '',
+                        'summary': self.summarize_text(row['Abstract'], use_finetuned=False)
+                    }
+                    summaries.append(paper_info)
+            # Generate overall summary
+            combined_summaries = " ".join([s['summary'] for s in summaries])
+            overall_summary = self.summarize_text(combined_summaries, max_length=250, use_finetuned=True)
+            return summaries, overall_summary
+        except Exception as e:
+            st.error(f"Error processing Excel file: {str(e)}")
+            return [], "Error generating summary"
+# Initialize session state
+if 'summarizer' not in st.session_state:
+    st.session_state['summarizer'] = None
+if 'summaries' not in st.session_state:
+    st.session_state['summaries'] = None
+if 'overall_summary' not in st.session_state:
+    st.session_state['overall_summary'] = None
+# App header
+st.title("Research Paper Summarizer")
+st.write("Upload an Excel file with research papers to generate summaries")
+# Sidebar inputs
+with st.sidebar:
+    st.header("Input Options")
+    uploaded_file = st.file_uploader("Choose an Excel file", type=['xlsx', 'xls'])
+    question = st.text_area("Enter your research question")
+    generate_button = st.button("Generate Summaries", type="primary", use_container_width=True)
+# Main processing
+if generate_button and uploaded_file and question:
+    try:
+        if st.session_state['summarizer'] is None:
+            st.session_state['summarizer'] = Summarizer()
+        with st.spinner("Processing papers..."):
+            summaries, overall_summary = st.session_state['summarizer'].process_excel(uploaded_file, question)
+            st.session_state['summaries'] = summaries
+            st.session_state['overall_summary'] = overall_summary
+    except Exception as e:
+        st.error(f"An error occurred: {str(e)}")
+elif generate_button:
+    st.warning("Please upload a file and enter a research question.")
+# Display results
+if st.session_state['overall_summary']:
+    st.header("Overall Summary")
+    st.write(st.session_state['overall_summary'])
+if st.session_state['summaries']:
+    st.header("Individual Paper Summaries")
+    # Sorting options
+    col1, _ = st.columns([2, 3])
+    with col1:
+        sort_by = st.selectbox(
+            "Sort by",
+            ["Year", "Citations", "Source", "Type", "Access", "Research Areas"],
+            index=0
+        )
+    # Sort summaries
+    summaries = st.session_state['summaries']
+    if sort_by == "Year":
+        summaries.sort(key=lambda x: str(x['year']), reverse=True)
+    elif sort_by == "Citations":
+        summaries.sort(key=lambda x: x['times_cited'], reverse=True)
+    elif sort_by == "Source":
+        summaries.sort(key=lambda x: str(x['source']))
+    elif sort_by == "Type":
+        summaries.sort(key=lambda x: str(x['document_type']))
+    elif sort_by == "Access":
+        summaries.sort(key=lambda x: str(x['open_access']))
+    elif sort_by == "Research Areas":
+        summaries.sort(key=lambda x: str(x['research_areas']))
+    # Display summaries
+    for paper in summaries:
+        with st.expander(f"{paper['title']} ({paper['year']})"):
+            col1, col2 = st.columns([2, 1])
+            with col1:
+                st.write("**Summary:**")
+                st.write(paper['summary'])
+            with col2:
+                st.write(f"**Authors:** {paper['authors']}")
+                st.write(f"**Source:** {paper['source']}")
+                st.write(f"**DOI:** {paper['doi']}")
+                st.write(f"**Document Type:** {paper['document_type']}")
+                st.write(f"**Times Cited:** {paper['times_cited']}")
+                st.write(f"**Open Access:** {paper['open_access']}")
+                st.write(f"**Research Areas:** {paper['research_areas']}")