File size: 2,919 Bytes
e0a45a8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
import gradio as gr

from transformers import pipeline
from PyPDF2 import PdfReader

generate_question_pipe = pipeline("text2text-generation", model="thangved/t5-generate-question")
qa_pipe = pipeline("question-answering", model="SharKRippeR/QA_T5_small_seq2seq")

MAX_OUTPUT = 3
MAX_INPUT_TOKENS = 256

# Truncate text to 256 tokens
def split_texts(text:str) -> list[str]:
    tokens = text.split(' ') # Split text into tokens

    # If the number of tokens is greater than 256, truncate it
    if len(tokens) > MAX_INPUT_TOKENS:
        tokens = tokens[:MAX_INPUT_TOKENS]

    texts = []

    for i in range(0, len(tokens), MAX_INPUT_TOKENS):
        texts.append(' '.join(tokens[i:i+64]))

    # Join tokens back into text
    return texts

def generate_questions_request(text:str) -> list[str]: # type: ignore
    response = generate_question_pipe(text)

    if response is None:
        return []

    result = []

    for question in response:
        questions = question['generated_text'].split('Question:')[1:] # type: ignore

        for question in questions:
            question = question.strip()
            result.append(question)


    return result

def generate_questions(file):
    if file is None:
        return [''] * (MAX_OUTPUT+1)

    reader = PdfReader(file.name)

    text = ''

    for page in reader.pages:
        text += page.extract_text()

    texts = split_texts(text)

    questions = [text]

    for text in texts:
        questions += generate_questions_request(text)

    i = len(questions)

    while i <= MAX_OUTPUT:
        questions.append('')
        i += 1

    return questions

def generate_answers(context='',q1='', q2='', q3=''):

    answers = []

    for q in [q1, q2, q3]:
        if q == '':
            answers.append('')
            continue

        answer = qa_pipe({
            'question': q,
            'context': context
        })

        answers.append(answer['answer']) # type: ignore

    return answers

with gr.Blocks() as demo:
    gr.Markdown("# PDF to Questions")

    with gr.Row():
        inp = gr.File(label='Select file', file_types=['.pdf'])
        context = gr.Textbox(label='Pdf content', lines=10)

    with gr.Row():
        with gr.Column():
            q1 = gr.Textbox(label='Question 1')
            q2 = gr.Textbox(label='Question 2')
            q3 = gr.Textbox(label='Question 3')

        with gr.Column():
            a1 = gr.Textbox(label='Answer 1')
            a2 = gr.Textbox(label='Answer 2')
            a3 = gr.Textbox(label='Answer 3')

    generate_question_btn = gr.Button('Generate questions')
    generate_answer_btn = gr.Button('Generate answers', variant='primary')

    generate_question_btn.click(fn=generate_questions, inputs=inp, outputs=[context, q1, q2, q3])
    generate_answer_btn.click(fn=generate_answers, inputs=[context, q1, q2, q3], outputs=[a1, a2, a3])

if __name__ == '__main__':
    demo.launch()