File size: 14,001 Bytes
0da1e60
 
 
 
 
26199ad
926b2e7
 
 
 
 
a964db1
 
 
 
926b2e7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a964db1
26199ad
0da1e60
 
 
 
 
926b2e7
0da1e60
926b2e7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0da1e60
 
 
 
 
 
926b2e7
 
 
 
 
0da1e60
926b2e7
0da1e60
81b538b
 
 
 
 
 
 
 
 
 
926b2e7
81b538b
 
 
926b2e7
 
 
 
 
 
 
0da1e60
926b2e7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0da1e60
926b2e7
 
 
 
0da1e60
926b2e7
0da1e60
926b2e7
0da1e60
926b2e7
0da1e60
926b2e7
0da1e60
926b2e7
0da1e60
926b2e7
0da1e60
926b2e7
0da1e60
926b2e7
 
 
0da1e60
926b2e7
0da1e60
926b2e7
0da1e60
926b2e7
0da1e60
926b2e7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
import os
import threading
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime
from queue import Queue

import pandas as pd
import streamlit as st
import torch
from peft import LoraConfig, PeftModel
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer


class Summarizer:
    """Summarize research-paper abstracts with BioBART.

    The base ``GanjinZero/biobart-base`` model produces per-paper summaries;
    a LoRA fine-tuned variant (loaded lazily from the current directory) is
    used only for the final combined summary. Worker threads never call
    Streamlit widgets directly — they push status messages onto
    ``update_queue``, which a dedicated UI-updater thread drains.
    """

    def __init__(self):
        """Load the base model and tokenizer and create the progress queue.

        Raises:
            Exception: re-raised (after showing a Streamlit error) when
                model loading fails.
        """
        try:
            with st.spinner("🤖 Initializing AI models..."):
                # Enable GPU if available
                self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
                st.info(f"Using device: {self.device}")

                # Load base model and tokenizer once. The fine-tuned adapter
                # is only needed for the final summary, so it is loaded
                # lazily in summarize_text().
                self.base_model = AutoModelForSeq2SeqLM.from_pretrained("GanjinZero/biobart-base").to(self.device)
                self.tokenizer = AutoTokenizer.from_pretrained("GanjinZero/biobart-base")
                self.finetuned_model = None

                # Queue for UI updates: Streamlit is not thread-safe, so
                # workers report progress here instead of calling st.* directly.
                self.update_queue = Queue()

                st.success("✅ Models ready!")

        except Exception as e:
            st.error(f"Error loading models: {str(e)}")
            raise

    def summarize_text(self, text, max_length=150, use_finetuned=False):
        """Generate an abstractive summary of *text*.

        Args:
            text: Input text (tokenized with truncation to 512 tokens).
            max_length: Maximum length of the generated summary in tokens.
            use_finetuned: When True, use (and lazily load) the LoRA
                fine-tuned model instead of the base model.

        Returns:
            The decoded summary string, or ``"Error generating summary"``
            when generation fails.
        """
        try:
            inputs = self.tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
            inputs = {k: v.to(self.device) for k, v in inputs.items()}

            with torch.no_grad():  # Memory optimization: inference needs no gradients
                if use_finetuned:
                    if self.finetuned_model is None:
                        with st.spinner("Loading fine-tuned model for final summary..."):
                            base_model_for_finetuned = AutoModelForSeq2SeqLM.from_pretrained(
                                "GanjinZero/biobart-base"
                            ).to(self.device)
                            lora_config = LoraConfig(
                                r=8,
                                lora_alpha=16,
                                target_modules=["q_proj", "v_proj"],
                                inference_mode=True
                            )
                            # NOTE(review): adapter weights are expected in the
                            # current working directory (".") — confirm this
                            # matches the deployment layout.
                            self.finetuned_model = PeftModel.from_pretrained(
                                base_model_for_finetuned,
                                ".",
                                config=lora_config,
                                is_trainable=False
                            )
                            self.finetuned_model.eval()

                    summary_ids = self.finetuned_model.generate(
                        **inputs,
                        max_length=max_length,
                        num_beams=4,
                        length_penalty=2.0,
                        early_stopping=True
                    )
                else:
                    # Bug fix: pass the attention mask too. The base path
                    # previously sent only input_ids, silently dropping the
                    # mask the tokenizer produced.
                    summary_ids = self.base_model.generate(
                        **inputs,
                        max_length=max_length,
                        num_beams=4,
                        length_penalty=2.0,
                        early_stopping=True
                    )

            return self.tokenizer.decode(summary_ids[0], skip_special_tokens=True)
        except Exception as e:
            st.error(f"Error in summarization: {str(e)}")
            return "Error generating summary"

    def process_paper(self, row):
        """Process a single paper row - used by ThreadPoolExecutor.

        Returns a dict of paper metadata plus its generated summary, or
        ``None`` when the row has no abstract.
        """
        if pd.notna(row['Abstract']):
            # Put update in queue instead of directly updating the UI
            # (Streamlit calls must not happen on worker threads).
            self.update_queue.put(f"Processing: {row['Article Title'][:100]}...")

            summary = self.summarize_text(row['Abstract'], use_finetuned=False)

            # Missing metadata fields collapse to '' (or 0 for citations) so
            # downstream display code never sees NaN.
            return {
                'title': row['Article Title'],
                'authors': row['Authors'] if pd.notna(row['Authors']) else '',
                'source': row['Source Title'] if pd.notna(row['Source Title']) else '',
                'year': row['Publication Year'] if pd.notna(row['Publication Year']) else '',
                'doi': row['DOI'] if pd.notna(row['DOI']) else '',
                'document_type': row['Document Type'] if pd.notna(row['Document Type']) else '',
                'times_cited': row['Times Cited, WoS Core'] if pd.notna(row['Times Cited, WoS Core']) else 0,
                'open_access': row['Open Access Designations'] if pd.notna(row['Open Access Designations']) else '',
                'research_areas': row['Research Areas'] if pd.notna(row['Research Areas']) else '',
                'summary': summary
            }
        return None

    def ui_updater(self, status_placeholder):
        """Separate thread: drain the queue into the UI until "DONE" arrives."""
        while True:
            message = self.update_queue.get()
            if message == "DONE":
                break
            status_placeholder.info(message)

    def process_excel(self, file):
        """Summarize every paper in an uploaded Excel file.

        Args:
            file: File-like object accepted by ``pd.read_excel``.

        Returns:
            ``(summaries, overall_summary)``: a list of per-paper dicts and a
            combined summary string. On failure returns
            ``([], "Error generating summary")``.
        """
        try:
            start_time = time.time()

            # Create containers for different sections
            stats_container = st.container()
            progress_container = st.container()
            preview_container = st.container()
            status_placeholder = st.empty()

            with stats_container:
                col1, col2, col3 = st.columns(3)

            # Only load required columns - Memory optimization
            required_columns = [
                'Article Title', 'Authors', 'Abstract', 'Source Title',
                'Publication Year', 'Times Cited, WoS Core', 'DOI',
                'Document Type', 'Research Areas', 'Open Access Designations'
            ]
            df = pd.read_excel(file, usecols=required_columns)
            total_papers = len(df)

            # Start UI updater thread. daemon=True is a safety net so an
            # unexpected crash cannot leave the process hanging on it.
            ui_thread = threading.Thread(
                target=self.ui_updater, args=(status_placeholder,), daemon=True
            )
            ui_thread.start()

            summaries = []
            completed_papers = 0

            try:
                # Process papers in parallel. Bug fix: iterate with
                # as_completed() so results are handled as they finish —
                # the old code walked the futures dict in submission order,
                # stalling progress updates behind slow papers.
                with ThreadPoolExecutor(max_workers=4) as executor:
                    futures = [executor.submit(self.process_paper, row)
                               for _, row in df.iterrows()]

                    for future in as_completed(futures):
                        result = future.result()
                        if result:
                            summaries.append(result)
                            completed_papers += 1

                            # Update running statistics
                            elapsed_time = time.time() - start_time
                            avg_time_per_paper = elapsed_time / completed_papers
                            estimated_time_remaining = avg_time_per_paper * (total_papers - completed_papers)

                            with stats_container:
                                col1.metric("Papers Processed", f"{completed_papers}/{total_papers}")
                                col2.metric("Avg Time per Paper", f"{avg_time_per_paper:.1f}s")
                                col3.metric("Est. Time Remaining", f"{estimated_time_remaining/60:.1f}min")

                            # Show latest processed paper in preview
                            with preview_container:
                                if completed_papers == 1:
                                    st.markdown("### 📋 Latest Processed Papers")
                                st.success(f"✅ Processed: {result['title'][:100]}...")
            finally:
                # Bug fix: always stop the UI-updater thread, even when a
                # worker raised — otherwise it blocks on the queue forever.
                self.update_queue.put("DONE")
                ui_thread.join()

            # Generate overall summary from the concatenated per-paper summaries
            with st.spinner("🤖 Generating final summary..."):
                combined_summaries = " ".join([s['summary'] for s in summaries])
                overall_summary = self.summarize_text(combined_summaries, max_length=250, use_finetuned=True)

            return summaries, overall_summary

        except Exception as e:
            st.error(f"Error processing Excel file: {str(e)}")
            return [], "Error generating summary"

def display_sorted_summaries(summaries, sort_by):
    """Render each paper summary in an expander, ordered by *sort_by*.

    Unknown sort options leave the original order untouched; the input list
    is never mutated.
    """
    # Dispatch table: sort option -> (key function, descending?).
    sort_rules = {
        "Year": (lambda p: str(p['year']), True),
        "Citations": (lambda p: p['times_cited'], True),
        "Source": (lambda p: str(p['source']), False),
        "Type": (lambda p: str(p['document_type']), False),
        "Access": (lambda p: str(p['open_access']), False),
        "Research Areas": (lambda p: str(p['research_areas']), False),
    }

    ordered = list(summaries)
    if sort_by in sort_rules:
        key_fn, descending = sort_rules[sort_by]
        ordered.sort(key=key_fn, reverse=descending)

    # One expander per paper: summary on the left, metadata on the right.
    for paper in ordered:
        with st.expander(f"📄 {paper['title']} ({paper['year']})"):
            summary_col, details_col = st.columns([2, 1])

            with summary_col:
                st.markdown("**Summary:**")
                st.write(paper['summary'])

            with details_col:
                st.markdown("**📝 Details:**")
                detail_fields = (
                    ("👥 Authors", 'authors'),
                    ("📰 Source", 'source'),
                    ("🔗 DOI", 'doi'),
                    ("📄 Type", 'document_type'),
                    ("📊 Citations", 'times_cited'),
                    ("🔓 Access", 'open_access'),
                    ("🔬 Areas", 'research_areas'),
                )
                for label, field in detail_fields:
                    value = paper[field]
                    if value:  # Only display non-empty values
                        st.write(f"**{label}:** {value}")

def main():
    """Streamlit entry point: sidebar inputs, processing trigger, results display.

    State (the Summarizer instance, per-paper summaries, and the overall
    summary) lives in ``st.session_state`` so results survive reruns.
    """
    st.set_page_config(page_title="Research Paper Summarizer", layout="wide")

    st.title("📚 Research Paper Summarizer")
    st.markdown("""
    Upload an Excel file containing research papers and get AI-generated summaries for each paper,
    plus an overall synthesis of the research.
    """)

    # Initialize session state
    if 'summarizer' not in st.session_state:
        st.session_state['summarizer'] = None
    if 'summaries' not in st.session_state:
        st.session_state['summaries'] = []
    if 'overall_summary' not in st.session_state:
        st.session_state['overall_summary'] = None

    # Sidebar inputs
    with st.sidebar:
        st.header("⚙️ Input Options")

        uploaded_file = st.file_uploader(
            "Choose an Excel file",
            type=['xlsx', 'xls'],
            help="Upload an Excel file containing research paper details including abstracts"
        )

        # NOTE(review): `question` only gates the Generate button below — it
        # is never passed into processing. Confirm whether it should steer
        # the summaries.
        question = st.text_area(
            "Enter your research question",
            help="Enter the main research question you're investigating"
        )

        if uploaded_file:
            st.info(f"📁 File uploaded: {uploaded_file.name}")

        generate_button = st.button(
            "🚀 Generate Summaries",
            type="primary",
            use_container_width=True,
            disabled=not (uploaded_file and question)
        )

        if not uploaded_file or not question:
            st.warning("⚠️ Please upload a file and enter a research question to proceed.")

    # Main processing
    try:
        if generate_button and uploaded_file and question:
            # Lazily construct the Summarizer (model loading is expensive)
            # and keep it across reruns.
            if st.session_state['summarizer'] is None:
                st.session_state['summarizer'] = Summarizer()

            summaries, overall_summary = st.session_state['summarizer'].process_excel(uploaded_file)
            st.session_state['summaries'] = summaries
            st.session_state['overall_summary'] = overall_summary

            # Save results for download
            if summaries:
                summary_df = pd.DataFrame(summaries)
                st.session_state['summary_df'] = summary_df

        # Display results if available
        if st.session_state['overall_summary']:
            st.header("📊 Overall Summary")
            st.write(st.session_state['overall_summary'])

            # Add download button for overall summary
            st.download_button(
                "📥 Download Overall Summary",
                st.session_state['overall_summary'],
                "overall_summary.txt",
                "text/plain"
            )

        if st.session_state['summaries']:
            st.header("📑 Individual Paper Summaries")

            # Sorting options (second column was unused — bound to `_`)
            sort_col, _ = st.columns([2, 3])
            with sort_col:
                sort_by = st.selectbox(
                    "🔄 Sort by",
                    ["Year", "Citations", "Source", "Type", "Access", "Research Areas"]
                )

            # Display sorted summaries
            display_sorted_summaries(st.session_state['summaries'], sort_by)

            # Add download button for detailed summaries
            if 'summary_df' in st.session_state:
                st.download_button(
                    "📥 Download Detailed Summaries",
                    st.session_state['summary_df'].to_csv(index=False),
                    "detailed_summaries.csv",
                    "text/csv"
                )

    except Exception as e:
        st.error(f"An error occurred: {str(e)}")
        st.error("Please try again or contact support if the problem persists.")

if __name__ == "__main__":
    # Entry point when executed directly (e.g. via `streamlit run`).
    main()