File size: 2,477 Bytes
e31e7fa
f219378
 
 
 
 
 
4d70dcc
0c86a34
f219378
 
 
 
 
 
 
 
 
 
 
 
 
 
e31e7fa
 
 
f219378
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e31e7fa
 
f219378
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
import gradio as gr
import PyPDF2
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Fetch the NLTK data packages this module uses. These calls execute at
# import time, every time the module is loaded.
nltk.download('punkt')      # tokenizer data (presumably what nltk.sent_tokenize / word_tokenize need)
nltk.download('stopwords')  # stop-word lists read through nltk.corpus.stopwords

def extract_text(file):
    """
    Extract the text of every page of a PDF and return it as one string.

    Args:
        file: Either a path string to the PDF, or an object that exposes the
            path as ``.name`` (for example an uploaded temp-file wrapper).

    Returns:
        The text of all pages concatenated in page order. A page that yields
        no text contributes an empty string.
    """
    # Accept a plain path as well as an upload object carrying the path in .name.
    path = file if isinstance(file, str) else file.name

    # The context manager guarantees the handle is closed even if parsing
    # raises; the original left the file open.
    with open(path, 'rb') as pdf_file:
        reader = PyPDF2.PdfReader(pdf_file)
        # Pages must be read while the file is still open. `or ""` guards
        # against a page whose extraction returns nothing.
        return "".join(page.extract_text() or "" for page in reader.pages)


def generate_answers(text, question):
    """
    Pick the sentence of ``text`` that best matches ``question``.

    The text is split into sentences, each sentence is turned into a TF-IDF
    vector, the question is projected into the same vector space, and the
    sentence with the highest cosine similarity to the question is returned.

    Args:
        text: The document text to search.
        question: The user's question.

    Returns:
        The best-matching sentence, or an apology message when the text or
        question is empty, or when no sentence shares any non-stop-word term
        with the question.
    """
    fallback = "I'm sorry, I couldn't find an answer to that question."

    sentences = nltk.sent_tokenize(text) if text else []
    if not sentences or not question or not question.strip():
        return fallback

    # scikit-learn expects stop_words as a list (or 'english' / None), not a set.
    vectorizer = TfidfVectorizer(stop_words=list(stopwords.words('english')))
    try:
        sentence_vectors = vectorizer.fit_transform(sentences)
    except ValueError:
        # Raised when the sentences contain nothing but stop words / no tokens,
        # leaving an empty vocabulary.
        return fallback

    # Vectorize the question with the SAME fitted vocabulary so it can be
    # compared against each sentence. The original indexed a
    # sentence-by-sentence similarity matrix with vocabulary indices, which
    # produced meaningless scores and IndexError on larger vocabularies.
    question_vector = vectorizer.transform([question])
    similarities = cosine_similarity(question_vector, sentence_vectors)[0]

    best_idx = int(similarities.argmax())
    # A zero score means the question shares no known term with any sentence.
    if similarities[best_idx] <= 0:
        return fallback

    return sentences[best_idx]


# Create the Gradio app interface
def app():
    """
    Build the Gradio interface for PDF question answering and launch it.

    The interface takes an uploaded PDF and a question, and shows the text
    extracted by extract_text together with the answer chosen by
    generate_answers.
    """
    # Components come from the top-level gr namespace; the gr.inputs /
    # gr.outputs modules used previously are deprecated.
    file_input = gr.File(label="Upload PDF Document")
    question_input = gr.Textbox(label="Enter a question")
    output_text = gr.Textbox(label="Extracted Text")
    output_answer = gr.Textbox(label="Answer")

    def predict(file, question):
        """Return (extracted text, answer) for one upload/question pair."""
        # Without an upload there is nothing to read; report it instead of
        # failing inside extract_text.
        if file is None:
            return "", "Please upload a PDF document."

        # Extract text from the file
        text = extract_text(file)

        # Generate an answer to the question
        answer = generate_answers(text, question)

        return text, answer

    # Create the interface and run the app
    iface = gr.Interface(
        fn=predict,
        inputs=[file_input, question_input],
        outputs=[output_text, output_answer],
        title="PDF QA Generator",
    )
    iface.launch()


# Launch the Gradio app only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    app()