Spaces:

tejovanth
/

exampletwo

Sleeping

App Files Files Community

tejovanth committed on Apr 17, 2025

Commit

f738250

verified ·

1 Parent(s): 6441138

Update app.py

Browse files

Files changed (1) hide show

app.py +58 -36

app.py CHANGED Viewed

@@ -1,82 +1,105 @@
 import gradio as gr
-from transformers import pipeline
 import fitz  # PyMuPDF for PDFs
-import pytesseract  # For OCR (images)
-from PIL import Image
 import io
# Build the summarization pipeline once at import time so every request
# reuses the same loaded DistilBART checkpoint.
summarizer = pipeline(
    "summarization",
    model="sshleifer/distilbart-cnn-12-6",
)
# Function to extract text from different file types
def extract_text(file_bytes):
    """Return the text content of an uploaded file supplied as raw bytes.

    The file type is sniffed from the first four bytes:

    * ``%PDF``            -> text of every page, extracted with PyMuPDF
    * JPEG / PNG magic    -> OCR via pytesseract
    * anything else       -> decoded as UTF-8

    Failures are reported in-band rather than raised: the function returns a
    string starting with "❌" when the bytes are not valid UTF-8 or when any
    other exception occurs while reading the file.

    Args:
        file_bytes: The complete file content as a ``bytes`` object.

    Returns:
        The extracted text, or a "❌ ..." message describing the failure.
    """
    try:
        header = file_bytes[:4]

        if header.startswith(b'%PDF'):
            doc = fitz.open(stream=file_bytes, filetype="pdf")
            try:
                # join() avoids repeated string reallocation on large PDFs.
                return "".join(page.get_text() for page in doc)
            finally:
                # Release the document even if a page fails to extract.
                doc.close()

        if header.startswith((b'\xFF\xD8', b'\x89PNG')):
            # It's an image (JPEG/PNG), use OCR
            with Image.open(io.BytesIO(file_bytes)) as image:
                return pytesseract.image_to_string(image)

        # Fall back to reading the upload as plain text.
        try:
            return file_bytes.decode("utf-8")
        except UnicodeDecodeError:
            return "❌ Unsupported file format or corrupted file."
    except Exception as e:
        # Top-level boundary for the UI: show the problem instead of crashing.
        return f"❌ Error reading file: {str(e)}"
# Function to chunk text into smaller pieces
def chunk_text(text, chunk_size=4000):
    """Split *text* into consecutive slices of at most *chunk_size* characters.

    Args:
        text: The string to split.
        chunk_size: Maximum length of each slice; must be positive.

    Returns:
        A list of slices in original order. Every slice except possibly the
        last has exactly ``chunk_size`` characters; empty input gives ``[]``.

    Raises:
        ValueError: If ``chunk_size`` is not positive. Without this check a
            zero size fails inside ``range()`` with an unrelated message and a
            negative size silently yields no chunks at all.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")
    return [text[start:start + chunk_size] for start in range(0, len(text), chunk_size)]
 # Summarize the extracted text
def summarize_file(file_bytes):
    """Extract text from an uploaded file and return a per-chunk summary report.

    The text is capped at 300,000 characters, split into 4,000-character
    chunks, and each chunk is summarized independently. A chunk that fails is
    reported inline so the remaining chunks are still processed.

    Args:
        file_bytes: Raw bytes of the uploaded file.

    Returns:
        A Markdown-style report with the number of characters processed and
        one summary (or error note) per chunk, or a "❌ ..." message when the
        file yields no usable text.
    """
    text = extract_text(file_bytes)
    if not text or not text.strip():
        return "❌ No text found in the uploaded file."

    # extract_text reports failures in-band as "❌ ..." strings. Surface them
    # directly instead of asking the model to summarize an error message.
    if text.startswith(("❌ Unsupported file format", "❌ Error reading file:")):
        return text

    # Cap the input at 300,000 characters.
    text = text[:300000]

    # Chunk the text into 4,000-character segments
    chunks = chunk_text(text, chunk_size=4000)
    if not chunks:
        return "❌ No valid chunks to summarize."

    # Summarize each chunk
    summaries = []
    for i, chunk in enumerate(chunks):
        try:
            summary = summarizer(
                chunk,
                max_length=150,
                min_length=40,
                do_sample=False,
                # Clip chunks that tokenize past the model's input limit
                # instead of failing the whole chunk.
                truncation=True,
            )
            summaries.append(f"**Chunk {i+1} Summary**:\n{summary[0]['summary_text']}")
        except Exception as e:
            summaries.append(f"**Chunk {i+1} Summary**: ❌ Error summarizing chunk: {str(e)}")

    # Combine summaries
    combined_summary = "\n\n".join(summaries)
    total_chars = len(text)
    return f"**Total Characters Processed**: {total_chars}\n\n**Summaries**:\n{combined_summary}"
 # Gradio UI
# One file upload in, one textbox of summaries out. type="binary" makes Gradio
# hand summarize_file the raw bytes of the upload.
demo = gr.Interface(
    fn=summarize_file,
    inputs=gr.File(label="📄 Upload Notes (PDF, TXT, or Handwritten Image)", type="binary"),
    outputs=gr.Textbox(label="📝 Summarized Notes"),
    title="📚 Note Summarizer",
    description="Upload academic notes in PDF, TXT, or image format (supports at least 300,000 characters). This app extracts and summarizes the content using a Hugging Face transformer model."
)

# Launch the interface only when run as a script, not when imported.
if __name__ == "__main__":
    demo.launch()

 import gradio as gr
 import fitz  # PyMuPDF for PDFs
 import io
+import torch
+from transformers import pipeline
+from tqdm import tqdm
+import time
# Select the inference device: CUDA device 0 when available, otherwise -1
# (CPU). The 5–10s latency target is only realistic on a GPU, so warn on CPU.
if torch.cuda.is_available():
    device = 0
else:
    device = -1
    print("⚠️ Warning: GPU not detected. 5–10s target requires a GPU. Expect slower performance.")

# Load the summarization model once at import time. distilbart-cnn-6-6 is the
# smaller, faster checkpoint; weights are loaded as float16 on the GPU and
# float32 on the CPU.
summarizer = pipeline(
    "summarization",
    model="sshleifer/distilbart-cnn-6-6",
    device=device,
    torch_dtype=torch.float32 if device == -1 else torch.float16,
)
# Function to extract text from PDFs or text files (skip images for speed)
def extract_text(file_bytes):
    """Return the text content of an uploaded PDF or UTF-8 text file.

    The file type is sniffed from the first four bytes: ``%PDF`` is read with
    PyMuPDF (plain-text extraction of every page); anything else is decoded
    as UTF-8.

    Failures are reported in-band rather than raised: the function returns a
    string starting with "❌" when the bytes are not valid UTF-8 or when any
    other exception occurs while reading the file.

    Args:
        file_bytes: The complete file content as a ``bytes`` object.

    Returns:
        The extracted text, or a "❌ ..." message describing the failure.
    """
    try:
        header = file_bytes[:4]

        if header.startswith(b'%PDF'):
            doc = fitz.open(stream=file_bytes, filetype="pdf")
            try:
                # join() avoids repeated string reallocation on large PDFs;
                # TEXTFLAGS_TEXT requests the plain-text extraction flags.
                return "".join(
                    page.get_text("text", flags=fitz.TEXTFLAGS_TEXT) for page in doc
                )
            finally:
                # Release the document even if a page fails to extract.
                doc.close()

        # Everything that is not a PDF is treated as plain text.
        try:
            return file_bytes.decode("utf-8")
        except UnicodeDecodeError:
            return "❌ Unsupported file format (images not supported for speed)."
    except Exception as e:
        # Top-level boundary for the UI: show the problem instead of crashing.
        return f"❌ Error reading file: {str(e)}"
# Function to chunk text
def chunk_text(text, chunk_size=10000):
    """Split *text* into consecutive slices of at most *chunk_size* characters.

    Args:
        text: The string to split.
        chunk_size: Maximum length of each slice; must be positive.

    Returns:
        A list of slices in original order. Every slice except possibly the
        last has exactly ``chunk_size`` characters; empty input gives ``[]``.

    Raises:
        ValueError: If ``chunk_size`` is not positive. Without this check a
            zero size fails inside ``range()`` with an unrelated message and a
            negative size silently yields no chunks at all.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")
    return [text[start:start + chunk_size] for start in range(0, len(text), chunk_size)]
 # Summarize the extracted text
def summarize_file(file_bytes):
    """Extract text from an uploaded file and return a batched summary report.

    The text is capped at 300,000 characters and split into 10,000-character
    chunks. At most ``max_chunks`` chunks are summarized, in batches, and the
    loop stops early once roughly 8 seconds have elapsed. A failed batch is
    reported inline so later batches are still attempted.

    Args:
        file_bytes: Raw bytes of the uploaded file.

    Returns:
        A Markdown-style report with the number of characters actually
        summarized, the elapsed time, and one summary (or error note) per
        chunk or batch, or a "❌ ..." message when the file yields no usable
        text.
    """
    # monotonic() is unaffected by system clock adjustments, unlike time().
    start_time = time.monotonic()

    text = extract_text(file_bytes)
    if not text or not text.strip():
        return "❌ No text found in the uploaded file."

    # extract_text reports failures in-band as "❌ ..." strings. Surface them
    # directly instead of asking the model to summarize an error message.
    if text.startswith(("❌ Unsupported file format", "❌ Error reading file:")):
        return text

    chunk_size = 10000  # characters per chunk
    max_chunks = 15  # Limit to ~150,000 chars for 5–10s (adjust as needed)

    # Cap at 300,000 characters (~30 chunks).
    text = text[:300000]
    chunks = chunk_text(text, chunk_size=chunk_size)
    if not chunks:
        return "❌ No valid chunks to summarize."

    # Summarize with batch processing
    summaries = []
    chars_processed = 0
    batch_size = 8 if device == 0 else 2  # Large batch on GPU, small on CPU
    limit = min(len(chunks), max_chunks)
    for i in range(0, limit, batch_size):
        if time.monotonic() - start_time > 8:  # Stop early if nearing 10s
            summaries.append("⚠️ Stopped early to meet 5–10s target. Not all text summarized.")
            break
        # Clamp the slice to `limit`: max_chunks is not a multiple of
        # batch_size, so an unclamped final batch would run past the cap.
        batch = chunks[i:min(i + batch_size, limit)]
        try:
            batch_summaries = summarizer(
                batch,
                max_length=100,  # Shorter summaries for speed
                min_length=20,
                do_sample=False,
                truncation=True,
                batch_size=batch_size
            )
            for j, summary in enumerate(batch_summaries):
                summaries.append(f"**Chunk {i+j+1} Summary**:\n{summary['summary_text']}")
            # Count only text that was really summarized, so the report stays
            # accurate after an early stop or a failed batch.
            chars_processed += sum(len(chunk) for chunk in batch)
        except Exception as e:
            # The whole batch failed, so name every chunk that was in it.
            if len(batch) == 1:
                label = f"Chunk {i+1}"
            else:
                label = f"Chunks {i+1}-{i+len(batch)}"
            summaries.append(f"**{label} Summary**: ❌ Error: {str(e)}")

    # Add note if not all chunks processed
    if len(chunks) > max_chunks:
        summaries.append(f"⚠️ Only {max_chunks} of {len(chunks)} chunks processed (~{max_chunks*chunk_size} chars). Full processing may take ~12–15s.")

    combined_summary = "\n\n".join(summaries)
    elapsed_time = time.monotonic() - start_time
    return f"**Total Characters Processed**: {chars_processed}\n**Time Taken**: {elapsed_time:.2f}s\n\n**Summaries**:\n{combined_summary}"
 # Gradio UI
# One file upload in, one textbox of summaries out. type="binary" makes Gradio
# hand summarize_file the raw bytes of the upload.
demo = gr.Interface(
    fn=summarize_file,
    inputs=gr.File(label="📄 Upload Notes (PDF or TXT)", type="binary"),
    outputs=gr.Textbox(label="📝 Summarized Notes"),
    title="📚 Ultra-Fast Note Summarizer",
    description="Upload academic notes in PDF or TXT format (supports ~300,000 characters). Optimized for 5–10s runtime using a lightweight model and GPU. Images not supported for speed."
)

# Launch the interface only when run as a script, not when imported.
if __name__ == "__main__":
    demo.launch()