heerjtdev committed on
Commit 7bf9c65 · verified · 1 Parent(s): 6b41319

Update app.py

Files changed (1)
app.py +76 -39
app.py CHANGED
@@ -2,32 +2,75 @@ import gradio as gr
 import PyPDF2
 import re
 import json
-from typing import List, Dict, Tuple
-from transformers import pipeline
+from typing import List, Dict
+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+import torch
 import tempfile
 import os
 
-# Initialize the question generation pipeline using a small CPU-friendly model
+# Initialize the model and tokenizer directly
 print("Loading models... This may take a minute on first run.")
-qa_generator = pipeline(
-    "text2text-generation",
-    model="valhalla/t5-small-qg-hl",
-    tokenizer="valhalla/t5-small-qg-hl",
-    device=-1  # Force CPU
-)
+
+model_name = "valhalla/t5-small-qg-hl"
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
+
+# Set to evaluation mode and CPU
+model.eval()
+device = torch.device("cpu")
+model.to(device)
+
+def generate_questions(context: str, answer: str, max_length: int = 128) -> str:
+    """Generate a question using T5 model."""
+    try:
+        # Format: "generate question: <hl> answer <hl> context"
+        input_text = f"generate question: <hl> {answer} <hl> {context}"
+
+        # Tokenize
+        inputs = tokenizer(
+            input_text,
+            return_tensors="pt",
+            max_length=512,
+            truncation=True,
+            padding=True
+        ).to(device)
+
+        # Generate
+        with torch.no_grad():
+            outputs = model.generate(
+                **inputs,
+                max_length=max_length,
+                num_beams=4,
+                early_stopping=True,
+                do_sample=True,
+                temperature=0.7
+            )
+
+        # Decode
+        question = tokenizer.decode(outputs[0], skip_special_tokens=True)
+
+        # Clean up
+        question = re.sub(r'^(question:|q:)', '', question, flags=re.IGNORECASE).strip()
+
+        return question if len(question) > 10 else ""
+
+    except Exception as e:
+        print(f"Error generating question: {e}")
+        return ""
 
 def extract_text_from_pdf(pdf_file) -> str:
     """Extract text from uploaded PDF file."""
     text = ""
     try:
-        # Handle both file path and file object
         if isinstance(pdf_file, str):
             pdf_reader = PyPDF2.PdfReader(pdf_file)
         else:
             pdf_reader = PyPDF2.PdfReader(pdf_file)
 
         for page in pdf_reader.pages:
-            text += page.extract_text() + "\n"
+            page_text = page.extract_text()
+            if page_text:
+                text += page_text + "\n"
     except Exception as e:
         return f"Error reading PDF: {str(e)}"
 
@@ -74,44 +117,32 @@ def generate_qa_pairs(chunk: str, num_questions: int = 2) -> List[Dict[str, str]
     flashcards = []
 
     # Skip chunks that are too short
-    if len(chunk.split()) < 20:
+    words = chunk.split()
+    if len(words) < 20:
         return []
 
     try:
-        # Generate highlight format for T5 question generation
-        # We'll create simple highlight by taking key sentences
-        sentences = chunk.split('. ')
-        if len(sentences) < 2:
+        # Split into sentences to use as answers
+        sentences = [s.strip() for s in chunk.split('. ') if len(s.strip()) > 20]
+
+        if len(sentences) < 1:
             return []
 
-        # Generate questions for different parts of the chunk
+        # Generate questions for different sentences
         for i in range(min(num_questions, len(sentences))):
-            # Create highlight context
-            highlight = sentences[i]
-            context = chunk
-
-            # Format for T5: "generate question: <hl> highlight <hl> context"
-            input_text = f"generate question: <hl> {highlight} <hl> {context}"
+            answer = sentences[i]
 
-            # Generate question
-            outputs = qa_generator(
-                input_text,
-                max_length=128,
-                num_return_sequences=1,
-                do_sample=True,
-                temperature=0.7
-            )
+            # Skip very short answers
+            if len(answer.split()) < 3:
+                continue
 
-            question = outputs[0]['generated_text'].strip()
+            question = generate_questions(chunk, answer)
 
-            # Clean up question
-            question = re.sub(r'^(question:|q:)', '', question, flags=re.IGNORECASE).strip()
-
-            if question and len(question) > 10:
+            if question and question != answer:  # Make sure they're different
                 flashcards.append({
                     "question": question,
-                    "answer": highlight.strip(),
-                    "context": context[:200] + "..." if len(context) > 200 else context
+                    "answer": answer,
+                    "context": chunk[:200] + "..." if len(chunk) > 200 else chunk
                 })
 
     except Exception as e:
@@ -305,12 +336,18 @@ with gr.Blocks(css=custom_css, title="PDF to Flashcards") as demo:
     gr.Markdown("*Raw JSON data for custom applications*")
 
     # Event handlers
+    def update_display(status):
+        """Update display when processing is done."""
+        if status and not status.startswith(("📄", "🧹", "✂️", "🎴", "✅")):
+            return status
+        return gr.update()
+
     process_btn.click(
         fn=process_pdf,
         inputs=[pdf_input, questions_per_chunk, max_chunks],
        outputs=[status_text, csv_output, json_output]
     ).then(
-        fn=lambda x: x if not isinstance(x, str) or not x.startswith("📄") else gr.update(),
+        fn=update_display,
         inputs=status_text,
         outputs=output_display
     )
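
For a quick smoke test of the new direct-generation path, generate_questions can be exercised outside Gradio. This is a minimal sketch, not part of the commit; the sample context and answer strings are invented purely for illustration:

from app import generate_questions  # loads the model and tokenizer at import time

# Hypothetical context/answer pair, for illustration only
context = ("The mitochondrion is the powerhouse of the cell. "
           "It produces ATP through oxidative phosphorylation.")
answer = "It produces ATP through oxidative phosphorylation"

# Returns "" if generation fails or the question is 10 characters or shorter
question = generate_questions(context, answer)
print(question or "(no usable question generated)")

Because generation combines num_beams=4 with do_sample=True and temperature=0.7, repeated calls may yield different questions for the same input.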