Spaces:

prat1003
/

project2

Sleeping

App Files Files Community

prat1003 committed on Oct 14, 2025

Commit

ca5f6c8

verified ·

1 Parent(s): d74a6a8

Update app.py

Browse files

Files changed (1) hide show

app.py +49 -19

app.py CHANGED Viewed

@@ -8,18 +8,27 @@ from pdf2image import convert_from_path
 import easyocr
 from PyPDF2 import PdfReader
 from transformers import pipeline
 # -----------------------------
 # Initialize OCR and Transformers
 # -----------------------------
 reader = easyocr.Reader(['en'])
 qg_pipeline = pipeline(
     "text2text-generation",
     model="valhalla/t5-small-qg-prepend",
     tokenizer="t5-small"
 )
 # -----------------------------
 # Extract text from selectable PDFs
 # -----------------------------
@@ -36,7 +45,6 @@ def extract_text_from_pdf(file_path):
 # Extract text from scanned PDFs using EasyOCR
 # -----------------------------
 def extract_text_from_scanned_pdf(file_path):
-    # Reduce DPI for faster processing
     pages = convert_from_path(file_path, dpi=150)
     text = ""
     for page in pages:
@@ -48,6 +56,25 @@ def extract_text_from_scanned_pdf(file_path):
             print("OCR error on page:", e)
     return text.strip()
 # -----------------------------
 # Main processing function
 # -----------------------------
@@ -69,29 +96,32 @@ def process_pdf(pdf_file):
     if not extracted_text.strip():
         return "❌ Could not extract text. Make sure the PDF has readable content."
-    # Step 3: Generate questions with beam search (3 questions)
-    prompt = "generate questions: " + extracted_text[:1000]  # limit to first 1000 chars
-    questions_output = qg_pipeline(
-        prompt,
-        max_length=128,
-        num_beams=3,            # beam search
-        num_return_sequences=3
-    )
-    # Step 4: Build question list
     question_list = []
-    for q in questions_output:
         question_list.append({
-            "questiontext": q["generated_text"],
             "questiontype": "single_select",
             "marks": 10,
             "options": [
-                {"optiontext": "Option 1", "score": "10"},
-                {"optiontext": "Option 2", "score": "0"}
             ]
         })
-    # Step 5: Build <questiondata> structure
     data = {
         "title": "Certification Title",
         "totalmarks": "50",
@@ -104,8 +134,8 @@ def process_pdf(pdf_file):
         "maxattempts": 3
     }
-    # Step 6: Wrap JSON in XML CDATA
-    xml_output = "<questiondata><![CDATA[" + json.dumps(data) + "]]></questiondata>"
     return xml_output
 # -----------------------------
@@ -115,8 +145,8 @@ iface = gr.Interface(
     fn=process_pdf,
     inputs=gr.File(label="📄 Upload your PDF"),
     outputs="text",
-    title="PDF to Question Generator (with OCR)",
-    description="Uploads a PDF, extracts text (or OCR for scanned PDFs), and generates <questiondata> XML for quizzes."
 )
 iface.launch()

 import easyocr
 from PyPDF2 import PdfReader
 from transformers import pipeline
+import random
 # -----------------------------
 # Initialize OCR and Transformers
 # -----------------------------
 reader = easyocr.Reader(['en'])
+# Question generation model
 qg_pipeline = pipeline(
     "text2text-generation",
     model="valhalla/t5-small-qg-prepend",
     tokenizer="t5-small"
 )
+# Question-answer generation model
+qa_pipeline = pipeline(
+    "text2text-generation",
+    model="valhalla/t5-small-qa-qg-hl",
+    tokenizer="t5-small"
+)
 # -----------------------------
 # Extract text from selectable PDFs
 # -----------------------------
 # Extract text from scanned PDFs using EasyOCR
 # -----------------------------
 def extract_text_from_scanned_pdf(file_path):
     pages = convert_from_path(file_path, dpi=150)
     text = ""
     for page in pages:
             print("OCR error on page:", e)
     return text.strip()
+# -----------------------------
+# Generate dummy options
+# -----------------------------
def generate_options(correct_answer, num_options=4, distractors=None):
    """Return a shuffled list of answer options that contains the correct answer.

    The correct answer is always included exactly once and is returned
    unchanged, so callers can still identify it with ``opt == correct_answer``.
    The remaining slots are filled with distinct filler options drawn at
    random from ``distractors``.

    Args:
        correct_answer: The option that should be scored as correct.
        num_options: Total number of options to return (default 4). Values
            below 1 still yield the correct answer on its own.
        distractors: Optional iterable of filler options. Defaults to a
            fixed set of five generic choices.

    Returns:
        A list in random order. It has ``num_options`` entries when enough
        distinct distractors are available, and fewer otherwise (instead of
        looping forever looking for options that do not exist).
    """
    if distractors is None:
        distractors = (
            "None of the above",
            "All of the above",
            "Not mentioned",
            "Cannot be determined",
            "Irrelevant information",
        )

    def _normalize(text):
        # Compare ignoring case and surrounding whitespace so that an answer
        # such as "none of the above " is not shown next to the canned
        # "None of the above" as a second, wrongly scored, option.
        return str(text).strip().casefold()

    seen = {_normalize(correct_answer)}
    pool = []
    for candidate in distractors:
        key = _normalize(candidate)
        if key not in seen:
            seen.add(key)
            pool.append(candidate)

    # Draw every filler in one call; capping at len(pool) keeps this total
    # even when the caller supplies too few distractors.
    wanted = max(num_options - 1, 0)
    options = [correct_answer] + random.sample(pool, min(wanted, len(pool)))
    random.shuffle(options)
    return options
 # -----------------------------
 # Main processing function
 # -----------------------------
     if not extracted_text.strip():
         return "❌ Could not extract text. Make sure the PDF has readable content."
+    # Step 3: Generate questions
+    prompt_q = "generate questions: " + extracted_text[:1000]
+    questions_output = qg_pipeline(prompt_q, max_length=128, num_beams=3, num_return_sequences=3)
+    # Step 4: Generate answers
+    prompt_a = "answer questions: " + extracted_text[:1000]
+    answers_output = qa_pipeline(prompt_a, max_length=64, num_beams=3, num_return_sequences=3)
+    # Step 5: Build question list
     question_list = []
+    for i, q in enumerate(questions_output):
+        question = q["generated_text"]
+        correct_answer = answers_output[i]["generated_text"] if i < len(answers_output) else "N/A"
+        options = generate_options(correct_answer)
         question_list.append({
+            "questiontext": question,
             "questiontype": "single_select",
             "marks": 10,
             "options": [
+                {"optiontext": opt, "score": "10" if opt == correct_answer else "0"}
+                for opt in options
             ]
         })
+    # Step 6: Build <questiondata> structure
     data = {
         "title": "Certification Title",
         "totalmarks": "50",
         "maxattempts": 3
     }
+    # Step 7: Wrap JSON in XML CDATA
+    xml_output = "<questiondata><![CDATA[" + json.dumps(data, indent=2) + "]]></questiondata>"
     return xml_output
 # -----------------------------
     fn=process_pdf,
     inputs=gr.File(label="📄 Upload your PDF"),
     outputs="text",
+    title="PDF → Question & Answer Generator (with OCR)",
+    description="Uploads a PDF, extracts text (or OCR for scanned PDFs), and generates XML with questions + answers."
 )
 iface.launch()