File size: 7,877 Bytes
bb7fdc2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
388323e
bb7fdc2
 
388323e
bb7fdc2
 
 
 
 
 
 
388323e
bb7fdc2
 
 
 
 
 
 
 
 
388323e
 
 
 
bb7fdc2
388323e
bb7fdc2
388323e
 
bb7fdc2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
388323e
 
bb7fdc2
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
# import os
# import pdfplumber
# import re
# import gradio as gr
# from transformers import pipeline, AutoModelForQuestionAnswering, AutoTokenizer
# from io import BytesIO
# import torch

# """
# Extract the text from a section of a PDF file  between 'wanted_section' and 'next_section'.
# Parameters: 
# - path (str): The file path to the PDF file.
# - wanted_section (str): The section to start extracting text from.
# - next_section (str): The section to stop extracting text at.
# Returns:
# - text (str): The extracted text from the specified section range.
# """


# def get_section(path, wanted_section, next_section):
#     print(wanted_section)

#     # Open the PDF file
#     doc = pdfplumber.open(BytesIO(path)) 
#     start_page = []
#     end_page = []

#     # Find the all the pages for the specified sections
#     for page in range(len(doc.pages)):
#         if len(doc.pages[page].search(wanted_section, return_chars=False, case=False)) > 0:
#             start_page.append(page)
#         if len(doc.pages[page].search(next_section, return_chars=False, case=False)) > 0:
#             end_page.append(page)

#     # Extract the text between the start and end page of the wanted section
#     text = []
#     for page_num in range(max(start_page), max(end_page)+1):
#         page = doc.pages[page_num]
#         text.append(page.extract_text())
#     text = " ".join(text)
#     final_text = text.replace("\n", " ")
#     return final_text


# def extract_between(big_string, start_string, end_string):
#     # Use a non-greedy match for content between start_string and end_string
#     pattern = re.escape(start_string) + '(.*?)' + re.escape(end_string)
#     match = re.search(pattern, big_string, re.DOTALL)

#     if match:
#         # Return the content without the start and end strings
#         return match.group(1)
#     else:
#         # Return None if the pattern is not found
#         return None

# def format_section1(section1_text):
#     result_section1_dict = {}

#     result_section1_dict['TOPIC'] = extract_between(section1_text, "Sektor", "EZ-Programm")
#     result_section1_dict['PROGRAM'] = extract_between(section1_text, "Sektor", "EZ-Programm")
#     result_section1_dict['PROJECT DESCRIPTION'] = extract_between(section1_text, "EZ-Programmziel", "Datum der letzten BE")
#     result_section1_dict['PROJECT NAME'] = extract_between(section1_text, "Modul", "Modulziel")
#     result_section1_dict['OBJECTIVE'] = extract_between(section1_text, "Modulziel", "Berichtszeitraum")
#     result_section1_dict['PROGRESS'] = extract_between(section1_text, "Zielerreichung des Moduls", "Massnahme im Zeitplan")
#     result_section1_dict['STATUS'] = extract_between(section1_text, "Massnahme im Zeitplan", "Risikoeinschätzung")
#     result_section1_dict['RECOMMENDATIONS'] = extract_between(section1_text, "Vorschläge zur Modulanpas-", "Voraussichtliche")

#     return result_section1_dict

# def answer_questions(text,language="de"):
#     # Initialize the zero-shot classification pipeline
#     model_name = "deepset/gelectra-large-germanquad"
#     model = AutoModelForQuestionAnswering.from_pretrained(model_name)
#     tokenizer = AutoTokenizer.from_pretrained(model_name)

#     # Initialize the QA pipeline
#     qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer)
#     questions = [
#         "Welches ist das Titel des Moduls?",
#         "Welches ist das Sektor oder das Kernthema?",
#         "Welches ist das Land?",
#         "Zu welchem Program oder EZ-Programm gehort das Projekt?"
#         #"Welche Durchführungsorganisation aus den 4 Varianten 'giz', 'kfw', 'ptb' und 'bgr' implementiert das Projekt?"
#         # "In dem Dokument was steht bei Sektor?",
#         # "In dem Dokument was steht von 'EZ-Programm' bis 'EZ-Programmziel'?",
#         # "In dem Dokument was steht bei EZ-Programmziel?",
#         # "In dem Dokument in dem Abschnitt '1. Kurzbeschreibung' was steht bei Modul?",
#         # "In dem Dokument was steht bei Zielerreichung des Moduls?",
#         # "In dem Dokument in dem Abschnitt '1. Kurzbeschreibung' was steht bei Maßnahme im Zeitplan?",
#         # "In dem Dokument was steht bei Vorschläge zur Modulanpassung?",
#         # "In dem Dokument in dem Abschnitt 'Anlage 1: Wirkungsmatrix des Moduls' was steht unter Laufzeit als erstes Datum?",
#         # "In dem Dokument in dem Abschnitt 'Anlage 1: Wirkungsmatrix des Moduls' was steht unter Laufzeit als zweites Datum?"
#     ]

#     # Iterate over each question and get answers
#     answers_dict = {}

#     for question in questions:
#         result = qa_pipeline(question=question, context=text)
#         # print(f"Question: {question}")
#         # print(f"Answer: {result['answer']}\n")
#         answers_dict[question] = result['answer']
#     return answers_dict


# def process_pdf(path):
#     results_dict = {}
#     results_dict["1. Kurzbeschreibung"] = \
#         get_section(path, "1. Kurzbeschreibung", "2. Einordnung des Moduls")
#     answers = answer_questions(results_dict["1. Kurzbeschreibung"])
#     return answers

# def get_first_page_text(file_data):
#     doc = pdfplumber.open(BytesIO(file_data)) 
#     if len(doc.pages):
#         return doc.pages[0].extract_text()

# if __name__ == "__main__":
    
#     # Define the Gradio interface
#     # iface = gr.Interface(fn=process_pdf, 
#     # demo = gr.Interface(fn=process_pdf, 
#     #                  inputs=gr.File(type="binary", label="Upload PDF"),
#     #                  outputs=gr.Textbox(label="Extracted Text"),
#     #                  title="PDF Text Extractor",
#     #                  description="Upload a PDF file to extract.")
#     # demo.launch()
#      demo = gr.Interface(fn=process_pdf, 
#                      inputs=gr.File(type="pdf"),
#                      outputs="text,
#                      title="PDF Text Extractor",
#                      description="Upload a PDF file to extract.")
#     demo.launch()

import gradio as gr
import pdfplumber
from transformers import pipeline
from io import BytesIO
import re

# Initialize the question-answering pipeline with a specific pre-trained model.
# NOTE: this runs at import time and is expensive — it downloads/loads the
# German QA model (deepset/gelectra-large-germanquad) before the app starts.
qa_pipeline = pipeline("question-answering", model="deepset/gelectra-large-germanquad")

def extract_text_from_pdf(file_obj):
    """Return the text of every page of a PDF, joined by single spaces.

    Parameters:
    - file_obj: a path or binary file-like object accepted by ``pdfplumber.open``.

    Returns:
    - str: concatenated page texts; pages with no extractable text are skipped.
    """
    with pdfplumber.open(file_obj) as pdf:
        # extract_text() may return None/"" for image-only pages;
        # filter(None, ...) drops those before joining.
        page_texts = (page.extract_text() for page in pdf.pages)
        return " ".join(filter(None, page_texts))

def answer_questions(context):
    """Answer a fixed set of German questions against the given context.

    Parameters:
    - context (str): the document text used as the QA context.

    Returns:
    - dict: maps each question string to the model's extracted answer.
    """
    questions = [
        "Welches ist das Titel des Moduls?",
        "Welches ist das Sektor oder das Kernthema?",
        "Welches ist das Land?",
        "Zu welchem Program oder EZ-Programm gehört das Projekt?"
    ]
    answers = {}
    for question in questions:
        # The pipeline returns a dict with 'answer', 'score', 'start', 'end';
        # only the answer text is kept.
        result = qa_pipeline(question=question, context=context)
        answers[question] = result['answer']
    return answers

def process_pdf(file):
    """Process an uploaded PDF: extract its text and answer the predefined questions.

    Parameters:
    - file: the Gradio file input. Depending on the component configuration this
      may be raw ``bytes`` (type="binary"), a filesystem path string
      (type="filepath"), or an open file-like object (legacy Gradio versions).

    Returns:
    - str: one "question: answer" pair per line.

    Fix: the original did ``with file as file_path: file_path.read()``, which
    crashes on modern Gradio where the input is bytes or a path string rather
    than a context-manager file object. All three shapes are now accepted.
    """
    if isinstance(file, bytes):
        # gr.File(type="binary") hands over the raw file contents.
        source = BytesIO(file)
    elif isinstance(file, str):
        # gr.File(type="filepath") hands over a path; pdfplumber opens paths directly.
        source = file
    elif hasattr(file, "read"):
        # Legacy Gradio: an open (temporary) file object.
        source = BytesIO(file.read())
    else:
        # Last resort: Gradio tempfile wrappers expose the path via .name.
        source = getattr(file, "name", file)

    text = extract_text_from_pdf(source)
    results = answer_questions(text)
    return "\n".join(f"{q}: {a}" for q, a in results.items())

# Define the Gradio interface
iface = gr.Interface(
    fn=process_pdf,
    inputs=gr.inputs.File(type="pdf", label="Upload your PDF file"),
    outputs=gr.outputs.Textbox(label="Extracted Information and Answers"),
    title="PDF Text Extractor and Question Answerer",
    description="Upload a PDF file to extract text and answer predefined questions based on the content."
)

if __name__ == "__main__":
    # Start the Gradio web server (blocks until interrupted).
    iface.launch()