Pranjal12345 committed on
Commit
d3f3a6b
1 Parent(s): ec4fb83

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +62 -0
  2. requirements.txt +3 -0
app.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import pandas as pd
3
+ import re
4
+ import os
5
+ import fitz
6
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
7
+
8
# Hugging Face checkpoint id shared by the tokenizer and the model. Keeping it
# in one place guarantees both are always loaded from the same repository.
MODEL_NAME = "potsawee/t5-large-generation-squad-QuestionAnswer"

# Loaded once at import time so every request reuses the same instances.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
10
+
11
def extract_text_from_pdf(pdf_file_path):
    """Return the concatenated plain text of every page in a PDF.

    Args:
        pdf_file_path: Path of the PDF file to open.

    Returns:
        The text of all pages joined in page order; an empty string when the
        document yields no text.
    """
    # The context manager closes the document (and its underlying file
    # handle) even if text extraction raises part-way through.
    with fitz.open(pdf_file_path) as doc:
        return "".join(page.get_text() for page in doc)
18
+
19
def generate_question_answer_pairs(pdf_file):
    """Generate question/answer pairs from an uploaded PDF.

    The PDF text is split into sentence-like fragments, each fragment is fed
    to the seq2seq model, and every decoded output containing a "?" is split
    into a question and an answer. The pairs are written to "QAPairs.csv" in
    the current working directory.

    Args:
        pdf_file: The uploaded file as handed over by the UI: either an object
            exposing the path as ``.name`` or a plain path string. ``None``
            when nothing was uploaded.

    Returns:
        A ``(text, csv_path)`` tuple. ``text`` is the human-readable listing
        of the pairs and ``csv_path`` is "QAPairs.csv". When no file was
        uploaded, ``text`` is a prompt to upload one and ``csv_path`` is
        ``None``.
    """
    if pdf_file is None:
        # The interface wires two output components to this function, so the
        # early exit must also produce two values.
        return "Please upload a PDF file", None

    # Accept both a file wrapper carrying `.name` and a bare path string.
    pdf_path = getattr(pdf_file, "name", pdf_file)
    pdf_text = extract_text_from_pdf(pdf_path)

    questions = []
    answers = []
    # Zero-width split right after '.', '!' or '?' keeps the terminator
    # attached to its sentence.
    for sentence in re.split(r'(?<=[.!?])', pdf_text):
        # The split leaves empty / whitespace-only fragments (e.g. after the
        # final terminator); they carry no content to ask a question about.
        if not sentence.strip():
            continue

        input_ids = tokenizer.encode(sentence, return_tensors="pt")
        outputs = model.generate(input_ids, max_length=100, num_return_sequences=1)
        question_answer = tokenizer.decode(outputs[0], skip_special_tokens=True)

        # Text up to the first "?" is the question; the text between the
        # first and second "?" is taken as the answer.
        qa_parts = question_answer.split("?")
        if len(qa_parts) < 2:
            continue
        questions.append(qa_parts[0] + "?")
        answers.append(qa_parts[1].strip())

    # Build the frame once instead of concatenating a one-row frame per pair.
    df = pd.DataFrame({'Question': questions, 'Answer': answers})
    df.to_csv("QAPairs.csv")

    result = "".join(
        f"Question: {question}\nAnswer: {answer}\n\n"
        for question, answer in zip(questions, answers)
    )
    return result, "QAPairs.csv"
50
+
51
# Heading displayed at the top of the demo page.
title = "Question-Answer Pairs Generation"

# UI components: a single PDF upload as input; the generated text and a
# downloadable CSV as outputs.
input_file = gr.File(label="Upload a PDF file")
output_file = gr.File(label="Download as csv")
output_text = gr.Textbox()

# Output components are listed text first, file second.
interface = gr.Interface(
    generate_question_answer_pairs,
    input_file,
    [output_text, output_file],
    title=title,
)
interface.launch()
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ torch
2
+ transformers
3
+ pymupdf