prat1003 committed on
Commit
ca5f6c8
·
verified ·
1 Parent(s): d74a6a8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +49 -19
app.py CHANGED
@@ -8,18 +8,27 @@ from pdf2image import convert_from_path
8
  import easyocr
9
  from PyPDF2 import PdfReader
10
  from transformers import pipeline
 
11
 
12
  # -----------------------------
13
  # Initialize OCR and Transformers
14
  # -----------------------------
15
  reader = easyocr.Reader(['en'])
16
 
 
17
  qg_pipeline = pipeline(
18
  "text2text-generation",
19
  model="valhalla/t5-small-qg-prepend",
20
  tokenizer="t5-small"
21
  )
22
 
 
 
 
 
 
 
 
23
  # -----------------------------
24
  # Extract text from selectable PDFs
25
  # -----------------------------
@@ -36,7 +45,6 @@ def extract_text_from_pdf(file_path):
36
  # Extract text from scanned PDFs using EasyOCR
37
  # -----------------------------
38
  def extract_text_from_scanned_pdf(file_path):
39
- # Reduce DPI for faster processing
40
  pages = convert_from_path(file_path, dpi=150)
41
  text = ""
42
  for page in pages:
@@ -48,6 +56,25 @@ def extract_text_from_scanned_pdf(file_path):
48
  print("OCR error on page:", e)
49
  return text.strip()
50
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
  # -----------------------------
52
  # Main processing function
53
  # -----------------------------
@@ -69,29 +96,32 @@ def process_pdf(pdf_file):
69
  if not extracted_text.strip():
70
  return "❌ Could not extract text. Make sure the PDF has readable content."
71
 
72
- # Step 3: Generate questions with beam search (3 questions)
73
- prompt = "generate questions: " + extracted_text[:1000] # limit to first 1000 chars
74
- questions_output = qg_pipeline(
75
- prompt,
76
- max_length=128,
77
- num_beams=3, # beam search
78
- num_return_sequences=3
79
- )
80
 
81
- # Step 4: Build question list
 
 
 
 
82
  question_list = []
83
- for q in questions_output:
 
 
 
 
84
  question_list.append({
85
- "questiontext": q["generated_text"],
86
  "questiontype": "single_select",
87
  "marks": 10,
88
  "options": [
89
- {"optiontext": "Option 1", "score": "10"},
90
- {"optiontext": "Option 2", "score": "0"}
91
  ]
92
  })
93
 
94
- # Step 5: Build <questiondata> structure
95
  data = {
96
  "title": "Certification Title",
97
  "totalmarks": "50",
@@ -104,8 +134,8 @@ def process_pdf(pdf_file):
104
  "maxattempts": 3
105
  }
106
 
107
- # Step 6: Wrap JSON in XML CDATA
108
- xml_output = "<questiondata><![CDATA[" + json.dumps(data) + "]]></questiondata>"
109
  return xml_output
110
 
111
  # -----------------------------
@@ -115,8 +145,8 @@ iface = gr.Interface(
115
  fn=process_pdf,
116
  inputs=gr.File(label="πŸ“„ Upload your PDF"),
117
  outputs="text",
118
- title="PDF to Question Generator (with OCR)",
119
- description="Uploads a PDF, extracts text (or OCR for scanned PDFs), and generates <questiondata> XML for quizzes."
120
  )
121
 
122
  iface.launch()
 
8
  import easyocr
9
  from PyPDF2 import PdfReader
10
  from transformers import pipeline
11
+ import random
12
 
13
  # -----------------------------
14
  # Initialize OCR and Transformers
15
  # -----------------------------
16
  reader = easyocr.Reader(['en'])
17
 
18
+ # Question generation model
19
  qg_pipeline = pipeline(
20
  "text2text-generation",
21
  model="valhalla/t5-small-qg-prepend",
22
  tokenizer="t5-small"
23
  )
24
 
25
+ # Question-answer generation model
26
+ qa_pipeline = pipeline(
27
+ "text2text-generation",
28
+ model="valhalla/t5-small-qa-qg-hl",
29
+ tokenizer="t5-small"
30
+ )
31
+
32
  # -----------------------------
33
  # Extract text from selectable PDFs
34
  # -----------------------------
 
45
  # Extract text from scanned PDFs using EasyOCR
46
  # -----------------------------
47
  def extract_text_from_scanned_pdf(file_path):
 
48
  pages = convert_from_path(file_path, dpi=150)
49
  text = ""
50
  for page in pages:
 
56
  print("OCR error on page:", e)
57
  return text.strip()
58
 
59
+ # -----------------------------
60
+ # Generate dummy options
61
+ # -----------------------------
62
def generate_options(correct_answer, num_options=4):
    """Build a shuffled multiple-choice option list containing the answer.

    The list holds ``correct_answer`` exactly once plus generic filler
    options drawn at random from a fixed pool.

    Args:
        correct_answer: The option text that must appear in the result.
        num_options: Total number of options wanted (default 4). The
            result is capped at one more than the number of usable
            fillers, and always contains at least the correct answer.

    Returns:
        A new list of unique option texts in random order.
    """
    distractor_pool = [
        "None of the above",
        "All of the above",
        "Not mentioned",
        "Cannot be determined",
        "Irrelevant information",
    ]
    # Drop any filler identical to the answer so options stay unique.
    candidates = [opt for opt in distractor_pool if opt != correct_answer]
    # Clamp the request to what the pool can actually supply; sampling
    # without replacement replaces the retry-until-unique loop.
    filler_count = max(0, min(num_options - 1, len(candidates)))
    options = [correct_answer, *random.sample(candidates, filler_count)]
    random.shuffle(options)
    return options
77
+
78
  # -----------------------------
79
  # Main processing function
80
  # -----------------------------
 
96
  if not extracted_text.strip():
97
  return "❌ Could not extract text. Make sure the PDF has readable content."
98
 
99
+ # Step 3: Generate questions
100
+ prompt_q = "generate questions: " + extracted_text[:1000]
101
+ questions_output = qg_pipeline(prompt_q, max_length=128, num_beams=3, num_return_sequences=3)
 
 
 
 
 
102
 
103
+ # Step 4: Generate answers
104
+ prompt_a = "answer questions: " + extracted_text[:1000]
105
+ answers_output = qa_pipeline(prompt_a, max_length=64, num_beams=3, num_return_sequences=3)
106
+
107
+ # Step 5: Build question list
108
  question_list = []
109
+ for i, q in enumerate(questions_output):
110
+ question = q["generated_text"]
111
+ correct_answer = answers_output[i]["generated_text"] if i < len(answers_output) else "N/A"
112
+
113
+ options = generate_options(correct_answer)
114
  question_list.append({
115
+ "questiontext": question,
116
  "questiontype": "single_select",
117
  "marks": 10,
118
  "options": [
119
+ {"optiontext": opt, "score": "10" if opt == correct_answer else "0"}
120
+ for opt in options
121
  ]
122
  })
123
 
124
+ # Step 6: Build <questiondata> structure
125
  data = {
126
  "title": "Certification Title",
127
  "totalmarks": "50",
 
134
  "maxattempts": 3
135
  }
136
 
137
+ # Step 7: Wrap JSON in XML CDATA
138
+ xml_output = "<questiondata><![CDATA[" + json.dumps(data, indent=2) + "]]></questiondata>"
139
  return xml_output
140
 
141
  # -----------------------------
 
145
  fn=process_pdf,
146
  inputs=gr.File(label="πŸ“„ Upload your PDF"),
147
  outputs="text",
148
+ title="PDF β†’ Question & Answer Generator (with OCR)",
149
+ description="Uploads a PDF, extracts text (or OCR for scanned PDFs), and generates XML with questions + answers."
150
  )
151
 
152
  iface.launch()