Spaces:

Kiruthick18
/

PDF_Summarizer

Running

App Files Files Community

Kiruthick18 committed on Aug 20

Commit

8ed6134

verified ·

1 Parent(s): 2d9c959

Delete app.py

Browse files

Files changed (1) hide show

app.py +0 -298

app.py DELETED Viewed

@@ -1,298 +0,0 @@
-import gradio as gr
-import PyPDF2
-import io
-from transformers import pipeline, AutoTokenizer
-import torch
-import re
-from typing import List, Tuple
-import warnings
-warnings.filterwarnings("ignore")
class PDFSummarizer:
    """Summarize the text of a PDF with a Hugging Face summarization pipeline.

    The workflow is: extract text page by page with PyPDF2, normalize it,
    split it into sentence-aligned chunks, summarize each chunk, and (when
    there are more than two chunks) summarize the joined chunk summaries once
    more.

    Attributes:
        model_name: Hub id of the checkpoint that was actually loaded.
        device: "cuda" or "cpu", matching where the pipeline runs.
        summarizer: The transformers "summarization" pipeline.
        tokenizer: Tokenizer loaded for ``model_name``. No method of this
            class reads it; it is kept for external callers.
    """

    # (max_length, min_length) passed to the generator per UI summary type.
    _LENGTHS_BY_TYPE = {
        "Brief (Quick)": (60, 20),
        "Detailed": (100, 40),
    }
    # Used for "Comprehensive" and for any unrecognized summary type.
    _DEFAULT_LENGTHS = (150, 60)

    def __init__(self):
        """Load the summarization pipeline, falling back to a CPU model on error."""
        # Distilled BART checkpoint: lighter than the full-size BART model.
        self.model_name = "sshleifer/distilbart-cnn-12-6"
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Using device: {self.device}")
        try:
            on_gpu = self.device == "cuda"
            self.summarizer = pipeline(
                "summarization",
                model=self.model_name,
                device=0 if on_gpu else -1,
                framework="pt",
                # Half precision only on GPU; CPU keeps float32.
                model_kwargs={"torch_dtype": torch.float16 if on_gpu else torch.float32},
            )
            self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
            print("Model loaded successfully")
        except Exception as e:
            print(f"Error loading model: {e}")
            # Last resort: the full-size BART checkpoint with default settings
            # on CPU. It is NOT faster than the distilled model; it is only a
            # second chance when the optimized load above fails.
            self.model_name = "facebook/bart-large-cnn"
            self.device = "cpu"  # keep the attribute in sync with device=-1
            self.summarizer = pipeline("summarization", model=self.model_name, device=-1)
            self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
            print("Fallback model loaded")

    def extract_text_from_pdf(self, pdf_file) -> str:
        """Extract text from the raw bytes of a PDF.

        Args:
            pdf_file: The PDF content as bytes (it is wrapped in ``BytesIO``).

        Returns:
            The text of every non-blank page, each preceded by a
            ``--- Page N ---`` marker line, stripped of outer whitespace.

        Raises:
            Exception: If the PDF cannot be read; the original error is chained.
        """
        try:
            pdf_reader = PyPDF2.PdfReader(io.BytesIO(pdf_file))
            parts = []
            for page_num, page in enumerate(pdf_reader.pages, start=1):
                # Guard against a page that yields no text object at all.
                page_text = page.extract_text() or ""
                if page_text.strip():
                    parts.append(f"\n--- Page {page_num} ---\n")
                    parts.append(page_text)
            return "".join(parts).strip()
        except Exception as e:
            raise Exception(f"Error extracting text from PDF: {str(e)}") from e

    def clean_text(self, text: str) -> str:
        """Normalize extracted text before chunking.

        Collapses whitespace, replaces characters other than word characters
        and basic punctuation with a space, removes the ``--- Page N ---``
        markers added during extraction, and squeezes the blanks those
        substitutions leave behind.
        """
        text = re.sub(r'\s+', ' ', text)
        text = re.sub(r'[^\w\s.,!?;:()\-"]', ' ', text)
        text = re.sub(r'--- Page \d+ ---', '', text)
        # The two substitutions above can create runs of spaces; collapse them
        # so downstream character counts and sentence splitting stay clean.
        text = re.sub(r' {2,}', ' ', text)
        return text.strip()

    def chunk_text(self, text: str, max_chunk_length: int = 512, max_chunks: int = 5) -> List[str]:
        """Split text into sentence-aligned chunks of bounded word count.

        Args:
            text: Cleaned text; sentences are detected by the ". " separator.
            max_chunk_length: Maximum number of whitespace-separated words per
                chunk. A single sentence longer than this becomes its own chunk.
            max_chunks: Only the first ``max_chunks`` chunks are returned, so
                text beyond that point is not summarized.

        Returns:
            A list of chunk strings (empty when ``text`` has no content).
        """
        pieces = text.split('. ')
        last_index = len(pieces) - 1

        chunks = []
        current = []
        current_words = 0
        for index, piece in enumerate(pieces):
            sentence = piece.strip()
            if not sentence:
                continue
            # split('. ') removed the period from every piece except the last
            # one; restore it there, and only add one to the last piece when
            # it does not already end a sentence (avoids a trailing "..").
            if index < last_index or not sentence.endswith(('.', '!', '?')):
                sentence += '.'

            sentence_words = len(sentence.split())
            if current and current_words + sentence_words > max_chunk_length:
                chunks.append(' '.join(current))
                current = []
                current_words = 0
            current.append(sentence)
            current_words += sentence_words

        if current:
            chunks.append(' '.join(current))
        return chunks[:max_chunks]

    def _summarize(self, chunk: str, max_length: int, min_length: int) -> str:
        """Run the pipeline on one chunk and return its summary; errors propagate."""
        # Generation lengths must be integers; callers may pass computed floats.
        max_length = max(int(max_length), 1)
        min_length = max(0, min(int(min_length), max_length))
        summary = self.summarizer(
            chunk,
            max_length=max_length,
            min_length=min_length,
            do_sample=False,
            truncation=True,
            early_stopping=True,
            num_beams=2,  # fewer beams than the usual default, for speed
        )
        return summary[0]['summary_text']

    def summarize_chunk(self, chunk: str, max_length: int = 100, min_length: int = 30) -> str:
        """Summarize one chunk of text.

        Args:
            chunk: Text to summarize.
            max_length: Upper bound on generated length (coerced to ``int``).
            min_length: Lower bound on generated length (coerced to ``int``).

        Returns:
            The summary text, or ``"Error summarizing chunk: <message>"`` when
            the pipeline raises; this method itself never raises.
        """
        try:
            return self._summarize(chunk, max_length, min_length)
        except Exception as e:
            return f"Error summarizing chunk: {str(e)}"

    def process_pdf(self, pdf_file, summary_type: str) -> Tuple[str, str, str]:
        """Extract, clean, chunk and summarize a PDF.

        Args:
            pdf_file: The PDF content as bytes.
            summary_type: "Brief (Quick)", "Detailed", or anything else for
                the longest ("Comprehensive") setting.

        Returns:
            ``(summary, statistics_markdown, status)``. On failure the first
            element carries the error message and the other two are empty.
        """
        try:
            raw_text = self.extract_text_from_pdf(pdf_file)
            if not raw_text.strip():
                return "❌ Error: No text could be extracted from the PDF.", "", ""

            cleaned_text = self.clean_text(raw_text)
            word_count = len(cleaned_text.split())
            char_count = len(cleaned_text)
            if word_count < 50:
                return "❌ Error: PDF contains too little text to summarize.", "", ""

            chunks = self.chunk_text(cleaned_text)
            max_len, min_len = self._LENGTHS_BY_TYPE.get(summary_type, self._DEFAULT_LENGTHS)

            # Use the raising helper so a failed chunk surfaces as an error
            # instead of its error text being blended into the summary.
            chunk_summaries = []
            for position, chunk in enumerate(chunks, start=1):
                print(f"Processing chunk {position}/{len(chunks)}")
                chunk_summaries.append(self._summarize(chunk, max_len, min_len))
            combined_summary = " ".join(chunk_summaries)

            if len(chunks) <= 2:
                # Few chunks: the joined summaries are short enough already.
                final_summary = combined_summary
            else:
                final_summary = self._summarize(
                    combined_summary,
                    max_length=min(200, int(max_len * 1.5)),
                    min_length=min_len,
                )

            summary_words = len(final_summary.split())
            # An empty summary would otherwise divide by zero.
            ratio = f"{word_count / summary_words:.1f}:1" if summary_words else "n/a"
            summary_stats = "\n".join([
                "📊 **Document Statistics:**",
                f"- Original word count: {word_count:,}",
                f"- Original character count: {char_count:,}",
                f"- Chunks processed: {len(chunks)}",
                f"- Summary word count: {summary_words:,}",
                f"- Compression ratio: {ratio}",
            ])
            return final_summary, summary_stats, "✅ Summary generated successfully!"
        except Exception as e:
            return f"❌ Error processing PDF: {str(e)}", "", ""
# Shared summarizer instance, created once when the module is imported and
# reused by every call to the Gradio callback below.
pdf_summarizer = PDFSummarizer()


def summarize_pdf_interface(pdf_file, summary_type):
    """Gradio callback: read the uploaded PDF from disk and summarize it.

    Args:
        pdf_file: Path of the uploaded file, or None when nothing is uploaded.
        summary_type: Summary length option forwarded to the summarizer.

    Returns:
        A ``(summary, stats, status)`` tuple; on failure the first element
        holds the error message and the other two are empty strings.
    """
    # Guard clause: nothing to do without an upload.
    if pdf_file is None:
        return "❌ Please upload a PDF file.", "", ""

    try:
        with open(pdf_file, 'rb') as handle:
            raw_bytes = handle.read()
        summary_text, stats_markdown, status_message = pdf_summarizer.process_pdf(
            raw_bytes, summary_type
        )
        return summary_text, stats_markdown, status_message
    except Exception as e:
        return f"❌ Error: {str(e)}", "", ""
# Build the Gradio UI
def create_interface():
    """Assemble and return the Gradio Blocks app for the PDF summarizer.

    The layout is a header, a two-column row (upload and options on the left,
    summary and statistics on the right) and a tips section. Both the button
    click and a change of the uploaded file run ``summarize_pdf_interface``.
    """
    custom_css = """
        .gradio-container {
            max-width: 1200px !important;
        }
        .summary-box {
            border-left: 4px solid #2196F3;
            padding: 16px;
            background-color: #f8f9fa;
        }
        """
    header_markdown = """
        # 📄 AI-Powered PDF Summarizer
        Upload any PDF document and get an intelligent summary in seconds!
        Perfect for research papers, reports, articles, and books.
        **Features:**
        - ⚡ Fast processing with BART model
        - 📊 Document statistics
        - 🎯 Multiple summary lengths
        - 🔍 Smart text chunking
        """
    tips_markdown = """
        ## 💡 Tips for Best Results:
        - **File Quality**: Ensure your PDF has selectable text (not just images)
        - **Length**: Works best with documents between 500-10,000 words
        - **Language**: Optimized for English content
        - **Format**: Clean, well-formatted PDFs produce better summaries
        ## 🔧 Technical Details:
        - **Model**: Facebook BART-Large-CNN (state-of-the-art summarization)
        - **Processing**: Smart text chunking with overlap prevention
        - **Speed**: GPU-accelerated when available
        """

    with gr.Blocks(title="📄 AI PDF Summarizer", theme=gr.themes.Soft(), css=custom_css) as demo:
        gr.Markdown(header_markdown)

        with gr.Row():
            # Left column: inputs and status.
            with gr.Column(scale=1):
                pdf_input = gr.File(
                    label="📁 Upload PDF File",
                    file_types=[".pdf"],
                    type="filepath",
                )
                summary_type = gr.Radio(
                    choices=["Brief (Quick)", "Detailed", "Comprehensive"],
                    value="Detailed",
                    label="📏 Summary Length",
                    info="Choose how detailed you want the summary to be",
                )
                summarize_btn = gr.Button("🚀 Generate Summary", variant="primary", size="lg")
                status_output = gr.Textbox(label="📋 Status", interactive=False, max_lines=2)

            # Right column: results.
            with gr.Column(scale=2):
                summary_output = gr.Textbox(
                    label="📝 Generated Summary",
                    lines=15,
                    max_lines=20,
                    interactive=False,
                    elem_classes=["summary-box"],
                )
                stats_output = gr.Markdown(
                    label="📊 Document Statistics",
                    value="Upload a PDF to see statistics",
                )

        gr.Markdown(tips_markdown)

        # The same handler serves both triggers: an explicit button click and
        # an automatic run whenever the uploaded file changes.
        handler_wiring = {
            "fn": summarize_pdf_interface,
            "inputs": [pdf_input, summary_type],
            "outputs": [summary_output, stats_output, status_output],
        }
        summarize_btn.click(**handler_wiring)
        pdf_input.change(**handler_wiring)

    return demo
# Script entry point: build the UI and start the Gradio server.
if __name__ == "__main__":
    create_interface().launch()