Kiruthick18 committed on
Commit
8ed6134
·
verified ·
1 Parent(s): 2d9c959

Delete app.py

Browse files
Files changed (1) hide show
  1. app.py +0 -298
app.py DELETED
@@ -1,298 +0,0 @@
1
- import gradio as gr
2
- import PyPDF2
3
- import io
4
- from transformers import pipeline, AutoTokenizer
5
- import torch
6
- import re
7
- from typing import List, Tuple
8
- import warnings
9
- warnings.filterwarnings("ignore")
10
-
11
class PDFSummarizer:
    """Summarize PDF documents with a Hugging Face summarization pipeline.

    Workflow: extract text page by page, normalize it, split it into
    word-bounded chunks, summarize each chunk, and (when there are more than
    two chunks) summarize the concatenated chunk summaries once more.
    """

    # (max_length, min_length) passed to the summarizer for each UI choice.
    # Any unrecognized choice is treated as "Comprehensive".
    _SUMMARY_LENGTHS = {
        "Brief (Quick)": (60, 20),
        "Detailed": (100, 40),
    }
    _COMPREHENSIVE_LENGTHS = (150, 60)

    # Compiled once; clean_text runs on every processed document.
    _WHITESPACE_RE = re.compile(r'\s+')
    _DISALLOWED_CHARS_RE = re.compile(r'[^\w\s.,!?;:()\-"]')
    _PAGE_MARKER_RE = re.compile(r'--- Page \d+ ---')

    def __init__(self):
        """Load the summarization pipeline and its tokenizer.

        Tries the distilled model first (on GPU with float16 when CUDA is
        available). If that fails for any reason, falls back to
        ``facebook/bart-large-cnn`` on CPU.
        """
        # Distilled model chosen for speed over BART-large.
        self.model_name = "sshleifer/distilbart-cnn-12-6"
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Using device: {self.device}")

        try:
            self.summarizer = pipeline(
                "summarization",
                model=self.model_name,
                device=0 if self.device == "cuda" else -1,
                framework="pt",
                model_kwargs={
                    "torch_dtype": torch.float16 if self.device == "cuda" else torch.float32
                },
            )

            # Tokenizer is loaded alongside the model for length calculations.
            self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
            print("Model loaded successfully")

        except Exception as e:
            print(f"Error loading model: {e}")
            # Fallback is the full BART-large model on CPU: slower than the
            # distilled model, but used as a last resort when it cannot load.
            self.model_name = "facebook/bart-large-cnn"
            self.summarizer = pipeline("summarization", model=self.model_name, device=-1)
            self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
            print("Fallback model loaded")

    def extract_text_from_pdf(self, pdf_file) -> str:
        """Extract the text of every non-empty page of a PDF.

        Args:
            pdf_file: Raw PDF content as bytes.

        Returns:
            The page texts concatenated, each preceded by a
            ``--- Page N ---`` marker line; empty string if no page has text.

        Raises:
            Exception: If the PDF cannot be parsed or read (the original
                error is chained as the cause).
        """
        try:
            pdf_reader = PyPDF2.PdfReader(io.BytesIO(pdf_file))
            sections = []

            for page_num, page in enumerate(pdf_reader.pages, start=1):
                # Guard against extractors that yield None for image-only pages.
                page_text = page.extract_text() or ""
                if page_text.strip():
                    sections.append(f"\n--- Page {page_num} ---\n{page_text}")

            return "".join(sections).strip()
        except Exception as e:
            raise Exception(f"Error extracting text from PDF: {e}") from e

    def clean_text(self, text: str) -> str:
        """Normalize extracted text before chunking.

        Collapses whitespace, replaces characters other than word characters
        and common punctuation with spaces, and removes the
        ``--- Page N ---`` markers added during extraction.

        Args:
            text: Raw extracted text.

        Returns:
            The cleaned text with single spaces between tokens.
        """
        text = self._WHITESPACE_RE.sub(' ', text)
        text = self._DISALLOWED_CHARS_RE.sub(' ', text)
        text = self._PAGE_MARKER_RE.sub('', text)
        # The two substitutions above can leave runs of spaces behind;
        # collapse again so character counts reflect the real content.
        text = self._WHITESPACE_RE.sub(' ', text)
        return text.strip()

    def chunk_text(self, text: str, max_chunk_length: int = 512, max_chunks: int = 5) -> List[str]:
        """Split text into sentence-aligned chunks bounded by word count.

        Sentences are detected by splitting on ``". "``. Whitespace-separated
        word count is used as a cheap stand-in for token count.

        Args:
            text: Cleaned text to split.
            max_chunk_length: Maximum number of words per chunk. A single
                sentence longer than this becomes its own oversized chunk.
            max_chunks: Maximum number of chunks returned; text beyond this
                limit is dropped to bound processing time.

        Returns:
            Up to ``max_chunks`` chunks; an empty list for empty text.
        """
        chunks = []
        current_sentences = []
        current_words = 0

        for piece in text.split('. '):
            sentence = piece.strip()
            if not sentence:
                # Empty input or consecutive separators: nothing to add.
                continue
            # Splitting removed the period from all but the last sentence;
            # restore it only where terminal punctuation is actually missing.
            if not sentence.endswith(('.', '!', '?')):
                sentence += '.'

            sentence_words = len(sentence.split())
            if current_sentences and current_words + sentence_words > max_chunk_length:
                chunks.append(" ".join(current_sentences))
                current_sentences = []
                current_words = 0

            current_sentences.append(sentence)
            current_words += sentence_words

        if current_sentences:
            chunks.append(" ".join(current_sentences))

        return chunks[:max(max_chunks, 0)]

    def summarize_chunk(self, chunk: str, max_length: int = 100, min_length: int = 30) -> str:
        """Summarize one chunk of text.

        Args:
            chunk: Text to summarize.
            max_length: Upper bound on generated summary length.
            min_length: Lower bound on generated summary length.

        Returns:
            The summary text, or an ``"Error summarizing chunk: ..."`` message
            if the pipeline raises (best-effort: one failing chunk does not
            abort the whole document).
        """
        try:
            summary = self.summarizer(
                chunk,
                # Generation lengths must be integers; callers may derive them
                # arithmetically, so coerce defensively.
                max_length=int(max_length),
                min_length=int(min_length),
                do_sample=False,
                truncation=True,
                early_stopping=True,
                num_beams=2,  # Reduced from default 4 for speed
            )
            return summary[0]['summary_text']
        except Exception as e:
            return f"Error summarizing chunk: {e}"

    def process_pdf(self, pdf_file, summary_type: str) -> Tuple[str, str, str]:
        """Extract, clean, chunk and summarize a PDF.

        Args:
            pdf_file: Raw PDF content as bytes.
            summary_type: One of ``"Brief (Quick)"``, ``"Detailed"``, or
                anything else (treated as ``"Comprehensive"``).

        Returns:
            A ``(summary, statistics_markdown, status_message)`` tuple. On
            failure the first element carries the error message and the other
            two are empty strings; this method does not raise.
        """
        try:
            raw_text = self.extract_text_from_pdf(pdf_file)

            if not raw_text.strip():
                return "❌ Error: No text could be extracted from the PDF.", "", ""

            cleaned_text = self.clean_text(raw_text)

            word_count = len(cleaned_text.split())
            char_count = len(cleaned_text)

            if word_count < 50:
                return "❌ Error: PDF contains too little text to summarize.", "", ""

            chunks = self.chunk_text(cleaned_text)

            max_len, min_len = self._SUMMARY_LENGTHS.get(
                summary_type, self._COMPREHENSIVE_LENGTHS
            )

            chunk_summaries = []
            for i, chunk in enumerate(chunks, start=1):
                print(f"Processing chunk {i}/{len(chunks)}")
                chunk_summaries.append(self.summarize_chunk(chunk, max_len, min_len))

            combined_summary = " ".join(chunk_summaries)

            # With one or two chunks the joined summaries are short enough;
            # skip the second pass for speed.
            if len(chunks) <= 2:
                final_summary = combined_summary
            else:
                final_summary = self.summarize_chunk(
                    combined_summary,
                    # int(): 1.5 * max_len is a float, but the generation
                    # length must be an integer.
                    max_length=min(200, int(max_len * 1.5)),
                    min_length=min_len,
                )

            summary_word_count = len(final_summary.split())
            # An empty summary must not turn a finished run into a
            # division-by-zero error.
            if summary_word_count:
                compression = f"{word_count / summary_word_count:.1f}:1"
            else:
                compression = "n/a"

            summary_stats = "\n".join([
                "📊 **Document Statistics:**",
                f"- Original word count: {word_count:,}",
                f"- Original character count: {char_count:,}",
                # This figure is the number of text chunks, not PDF pages.
                f"- Chunks processed: {len(chunks)}",
                f"- Summary word count: {summary_word_count:,}",
                f"- Compression ratio: {compression}",
            ])

            return final_summary, summary_stats, "✅ Summary generated successfully!"

        except Exception as e:
            return f"❌ Error processing PDF: {e}", "", ""
171
-
172
# Initialize the summarizer once at module level so the model is loaded a
# single time and shared by every call to summarize_pdf_interface.
pdf_summarizer = PDFSummarizer()
174
-
175
def summarize_pdf_interface(pdf_file, summary_type):
    """Gradio callback: read the uploaded PDF and return its summary.

    Args:
        pdf_file: Path of the uploaded file, or None when nothing is uploaded.
        summary_type: Summary length option selected in the UI.

    Returns:
        A (summary, statistics, status) tuple of strings; on failure the
        first element holds the error message and the others are empty.
    """
    # Guard clause: nothing uploaded yet (or the upload was cleared).
    if pdf_file is None:
        return "❌ Please upload a PDF file.", "", ""

    try:
        # The file component hands over a path, so load the bytes ourselves.
        with open(pdf_file, 'rb') as handle:
            raw_bytes = handle.read()

        summary_text, stats_markdown, status_message = pdf_summarizer.process_pdf(
            raw_bytes, summary_type
        )
        return summary_text, stats_markdown, status_message

    except Exception as e:
        return f"❌ Error: {str(e)}", "", ""
192
-
193
- # Create Gradio interface
194
def create_interface():
    """Build the Gradio Blocks UI for the PDF summarizer.

    The layout has an upload/options column on the left and the summary plus
    statistics on the right. Summarization is triggered both by the button
    and by a change of the uploaded file.

    Returns:
        The configured ``gr.Blocks`` instance; the caller launches it.
    """
    custom_css = "\n".join([
        ".gradio-container {",
        "    max-width: 1200px !important;",
        "}",
        ".summary-box {",
        "    border-left: 4px solid #2196F3;",
        "    padding: 16px;",
        "    background-color: #f8f9fa;",
        "}",
    ])

    # The model named here must match what the summarizer actually loads:
    # the distilled BART checkpoint, not BART-large.
    header_markdown = "\n".join([
        "# 📄 AI-Powered PDF Summarizer",
        "",
        "Upload any PDF document and get an intelligent summary in seconds!",
        "Perfect for research papers, reports, articles, and books.",
        "",
        "**Features:**",
        "- ⚡ Fast processing with a DistilBART model",
        "- 📊 Document statistics",
        "- 🎯 Multiple summary lengths",
        "- 🔍 Smart text chunking",
    ])

    tips_markdown = "\n".join([
        "## 💡 Tips for Best Results:",
        "",
        "- **File Quality**: Ensure your PDF has selectable text (not just images)",
        "- **Length**: Works best with documents between 500-10,000 words",
        "- **Language**: Optimized for English content",
        "- **Format**: Clean, well-formatted PDFs produce better summaries",
        "",
        "## 🔧 Technical Details:",
        "- **Model**: DistilBART (`sshleifer/distilbart-cnn-12-6`), "
        "falling back to `facebook/bart-large-cnn` if it cannot be loaded",
        "- **Processing**: Smart text chunking with overlap prevention",
        "- **Speed**: GPU-accelerated when available",
    ])

    with gr.Blocks(
        title="📄 AI PDF Summarizer",
        theme=gr.themes.Soft(),
        css=custom_css,
    ) as interface:

        gr.Markdown(header_markdown)

        with gr.Row():
            with gr.Column(scale=1):
                pdf_input = gr.File(
                    label="📁 Upload PDF File",
                    file_types=[".pdf"],
                    type="filepath",
                )

                summary_type = gr.Radio(
                    choices=["Brief (Quick)", "Detailed", "Comprehensive"],
                    value="Detailed",
                    label="📏 Summary Length",
                    info="Choose how detailed you want the summary to be",
                )

                summarize_btn = gr.Button(
                    "🚀 Generate Summary",
                    variant="primary",
                    size="lg",
                )

                status_output = gr.Textbox(
                    label="📋 Status",
                    interactive=False,
                    max_lines=2,
                )

            with gr.Column(scale=2):
                summary_output = gr.Textbox(
                    label="📝 Generated Summary",
                    lines=15,
                    max_lines=20,
                    interactive=False,
                    elem_classes=["summary-box"],
                )

                stats_output = gr.Markdown(
                    label="📊 Document Statistics",
                    value="Upload a PDF to see statistics",
                )

        gr.Markdown(tips_markdown)

        # Both triggers share the same callback and the same output order:
        # (summary, statistics, status).
        callback_inputs = [pdf_input, summary_type]
        callback_outputs = [summary_output, stats_output, status_output]

        # Explicit run, e.g. after choosing a different summary length.
        summarize_btn.click(
            fn=summarize_pdf_interface,
            inputs=callback_inputs,
            outputs=callback_outputs,
        )

        # Auto-process when a file is uploaded (or cleared).
        pdf_input.change(
            fn=summarize_pdf_interface,
            inputs=callback_inputs,
            outputs=callback_outputs,
        )

    return interface
294
-
295
# Launch the application only when this file is executed as a script;
# importing the module builds nothing and starts no server from this guard.
if __name__ == "__main__":
    interface = create_interface()
    interface.launch()