andreeabodea committed
Commit
bb7fdc2
1 Parent(s): 6dcfc3f

Update app.py

Files changed (1)
  1. app.py +172 -116
app.py CHANGED
@@ -1,128 +1,184 @@
- import os
- import pdfplumber
- import re
  import gradio as gr
- from transformers import pipeline, AutoModelForQuestionAnswering, AutoTokenizer
  from io import BytesIO
- import torch
-
- """
- Extract the text from a section of a PDF file between 'wanted_section' and 'next_section'.
- Parameters:
- - path (str): The file path to the PDF file.
- - wanted_section (str): The section to start extracting text from.
- - next_section (str): The section to stop extracting text at.
- Returns:
- - text (str): The extracted text from the specified section range.
- """
-
-
- def get_section(path, wanted_section, next_section):
-     print(wanted_section)
-
-     # Open the PDF file
-     doc = pdfplumber.open(BytesIO(path))
-     start_page = []
-     end_page = []
-
-     # Find the all the pages for the specified sections
-     for page in range(len(doc.pages)):
-         if len(doc.pages[page].search(wanted_section, return_chars=False, case=False)) > 0:
-             start_page.append(page)
-         if len(doc.pages[page].search(next_section, return_chars=False, case=False)) > 0:
-             end_page.append(page)
-
-     # Extract the text between the start and end page of the wanted section
      text = []
-     for page_num in range(max(start_page), max(end_page)+1):
-         page = doc.pages[page_num]
-         text.append(page.extract_text())
-     text = " ".join(text)
-     final_text = text.replace("\n", " ")
-     return final_text
-
-
- def extract_between(big_string, start_string, end_string):
-     # Use a non-greedy match for content between start_string and end_string
-     pattern = re.escape(start_string) + '(.*?)' + re.escape(end_string)
-     match = re.search(pattern, big_string, re.DOTALL)
-
-     if match:
-         # Return the content without the start and end strings
-         return match.group(1)
-     else:
-         # Return None if the pattern is not found
-         return None
-
- def format_section1(section1_text):
-     result_section1_dict = {}
-
-     result_section1_dict['TOPIC'] = extract_between(section1_text, "Sektor", "EZ-Programm")
-     result_section1_dict['PROGRAM'] = extract_between(section1_text, "Sektor", "EZ-Programm")
-     result_section1_dict['PROJECT DESCRIPTION'] = extract_between(section1_text, "EZ-Programmziel", "Datum der letzten BE")
-     result_section1_dict['PROJECT NAME'] = extract_between(section1_text, "Modul", "Modulziel")
-     result_section1_dict['OBJECTIVE'] = extract_between(section1_text, "Modulziel", "Berichtszeitraum")
-     result_section1_dict['PROGRESS'] = extract_between(section1_text, "Zielerreichung des Moduls", "Massnahme im Zeitplan")
-     result_section1_dict['STATUS'] = extract_between(section1_text, "Massnahme im Zeitplan", "Risikoeinschätzung")
-     result_section1_dict['RECOMMENDATIONS'] = extract_between(section1_text, "Vorschläge zur Modulanpas-", "Voraussichtliche")
-
-     return result_section1_dict
-
- def answer_questions(text,language="de"):
-     # Initialize the zero-shot classification pipeline
-     model_name = "deepset/gelectra-large-germanquad"
-     model = AutoModelForQuestionAnswering.from_pretrained(model_name)
-     tokenizer = AutoTokenizer.from_pretrained(model_name)
-
-     # Initialize the QA pipeline
-     qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer)
      questions = [
          "Welches ist das Titel des Moduls?",
          "Welches ist das Sektor oder das Kernthema?",
          "Welches ist das Land?",
-         "Zu welchem Program oder EZ-Programm gehort das Projekt?"
-         #"Welche Durchführungsorganisation aus den 4 Varianten 'giz', 'kfw', 'ptb' und 'bgr' implementiert das Projekt?"
-         # "In dem Dokument was steht bei Sektor?",
-         # "In dem Dokument was steht von 'EZ-Programm' bis 'EZ-Programmziel'?",
-         # "In dem Dokument was steht bei EZ-Programmziel?",
-         # "In dem Dokument in dem Abschnitt '1. Kurzbeschreibung' was steht bei Modul?",
-         # "In dem Dokument was steht bei Zielerreichung des Moduls?",
-         # "In dem Dokument in dem Abschnitt '1. Kurzbeschreibung' was steht bei Maßnahme im Zeitplan?",
-         # "In dem Dokument was steht bei Vorschläge zur Modulanpassung?",
-         # "In dem Dokument in dem Abschnitt 'Anlage 1: Wirkungsmatrix des Moduls' was steht unter Laufzeit als erstes Datum?",
-         # "In dem Dokument in dem Abschnitt 'Anlage 1: Wirkungsmatrix des Moduls' was steht unter Laufzeit als zweites Datum?"
      ]
-
-     # Iterate over each question and get answers
-     answers_dict = {}
-
-     for question in questions:
-         result = qa_pipeline(question=question, context=text)
-         # print(f"Question: {question}")
-         # print(f"Answer: {result['answer']}\n")
-         answers_dict[question] = result['answer']
-     return answers_dict
-
-
- def process_pdf(path):
-     results_dict = {}
-     results_dict["1. Kurzbeschreibung"] = \
-         get_section(path, "1. Kurzbeschreibung", "2. Einordnung des Moduls")
-     answers = answer_questions(results_dict["1. Kurzbeschreibung"])
      return answers

- def get_first_page_text(file_data):
-     doc = pdfplumber.open(BytesIO(file_data))
-     if len(doc.pages):
-         return doc.pages[0].extract_text()

  if __name__ == "__main__":
-
-     # Define the Gradio interface
-     # iface = gr.Interface(fn=process_pdf,
-     demo = gr.Interface(fn=process_pdf,
-                         inputs=gr.File(type="binary", label="Upload PDF"),
-                         outputs=gr.Textbox(label="Extracted Text"),
-                         title="PDF Text Extractor",
-                         description="Upload a PDF file to extract.")
-     demo.launch()
+ # import os
+ # import pdfplumber
+ # import re
+ # import gradio as gr
+ # from transformers import pipeline, AutoModelForQuestionAnswering, AutoTokenizer
+ # from io import BytesIO
+ # import torch
+
+ # """
+ # Extract the text from a section of a PDF file between 'wanted_section' and 'next_section'.
+ # Parameters:
+ # - path (str): The file path to the PDF file.
+ # - wanted_section (str): The section to start extracting text from.
+ # - next_section (str): The section to stop extracting text at.
+ # Returns:
+ # - text (str): The extracted text from the specified section range.
+ # """
+
+
+ # def get_section(path, wanted_section, next_section):
+ #     print(wanted_section)
+
+ #     # Open the PDF file
+ #     doc = pdfplumber.open(BytesIO(path))
+ #     start_page = []
+ #     end_page = []
+
+ #     # Find the all the pages for the specified sections
+ #     for page in range(len(doc.pages)):
+ #         if len(doc.pages[page].search(wanted_section, return_chars=False, case=False)) > 0:
+ #             start_page.append(page)
+ #         if len(doc.pages[page].search(next_section, return_chars=False, case=False)) > 0:
+ #             end_page.append(page)
+
+ #     # Extract the text between the start and end page of the wanted section
+ #     text = []
+ #     for page_num in range(max(start_page), max(end_page)+1):
+ #         page = doc.pages[page_num]
+ #         text.append(page.extract_text())
+ #     text = " ".join(text)
+ #     final_text = text.replace("\n", " ")
+ #     return final_text
+
+
+ # def extract_between(big_string, start_string, end_string):
+ #     # Use a non-greedy match for content between start_string and end_string
+ #     pattern = re.escape(start_string) + '(.*?)' + re.escape(end_string)
+ #     match = re.search(pattern, big_string, re.DOTALL)
+
+ #     if match:
+ #         # Return the content without the start and end strings
+ #         return match.group(1)
+ #     else:
+ #         # Return None if the pattern is not found
+ #         return None
+
+ # def format_section1(section1_text):
+ #     result_section1_dict = {}
+
+ #     result_section1_dict['TOPIC'] = extract_between(section1_text, "Sektor", "EZ-Programm")
+ #     result_section1_dict['PROGRAM'] = extract_between(section1_text, "Sektor", "EZ-Programm")
+ #     result_section1_dict['PROJECT DESCRIPTION'] = extract_between(section1_text, "EZ-Programmziel", "Datum der letzten BE")
+ #     result_section1_dict['PROJECT NAME'] = extract_between(section1_text, "Modul", "Modulziel")
+ #     result_section1_dict['OBJECTIVE'] = extract_between(section1_text, "Modulziel", "Berichtszeitraum")
+ #     result_section1_dict['PROGRESS'] = extract_between(section1_text, "Zielerreichung des Moduls", "Massnahme im Zeitplan")
+ #     result_section1_dict['STATUS'] = extract_between(section1_text, "Massnahme im Zeitplan", "Risikoeinschätzung")
+ #     result_section1_dict['RECOMMENDATIONS'] = extract_between(section1_text, "Vorschläge zur Modulanpas-", "Voraussichtliche")
+
+ #     return result_section1_dict
+
+ # def answer_questions(text,language="de"):
+ #     # Initialize the zero-shot classification pipeline
+ #     model_name = "deepset/gelectra-large-germanquad"
+ #     model = AutoModelForQuestionAnswering.from_pretrained(model_name)
+ #     tokenizer = AutoTokenizer.from_pretrained(model_name)
+
+ #     # Initialize the QA pipeline
+ #     qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer)
+ #     questions = [
+ #         "Welches ist das Titel des Moduls?",
+ #         "Welches ist das Sektor oder das Kernthema?",
+ #         "Welches ist das Land?",
+ #         "Zu welchem Program oder EZ-Programm gehort das Projekt?"
+ #         #"Welche Durchführungsorganisation aus den 4 Varianten 'giz', 'kfw', 'ptb' und 'bgr' implementiert das Projekt?"
+ #         # "In dem Dokument was steht bei Sektor?",
+ #         # "In dem Dokument was steht von 'EZ-Programm' bis 'EZ-Programmziel'?",
+ #         # "In dem Dokument was steht bei EZ-Programmziel?",
+ #         # "In dem Dokument in dem Abschnitt '1. Kurzbeschreibung' was steht bei Modul?",
+ #         # "In dem Dokument was steht bei Zielerreichung des Moduls?",
+ #         # "In dem Dokument in dem Abschnitt '1. Kurzbeschreibung' was steht bei Maßnahme im Zeitplan?",
+ #         # "In dem Dokument was steht bei Vorschläge zur Modulanpassung?",
+ #         # "In dem Dokument in dem Abschnitt 'Anlage 1: Wirkungsmatrix des Moduls' was steht unter Laufzeit als erstes Datum?",
+ #         # "In dem Dokument in dem Abschnitt 'Anlage 1: Wirkungsmatrix des Moduls' was steht unter Laufzeit als zweites Datum?"
+ #     ]
+
+ #     # Iterate over each question and get answers
+ #     answers_dict = {}
+
+ #     for question in questions:
+ #         result = qa_pipeline(question=question, context=text)
+ #         # print(f"Question: {question}")
+ #         # print(f"Answer: {result['answer']}\n")
+ #         answers_dict[question] = result['answer']
+ #     return answers_dict
+
+
+ # def process_pdf(path):
+ #     results_dict = {}
+ #     results_dict["1. Kurzbeschreibung"] = \
+ #         get_section(path, "1. Kurzbeschreibung", "2. Einordnung des Moduls")
+ #     answers = answer_questions(results_dict["1. Kurzbeschreibung"])
+ #     return answers
+
+ # def get_first_page_text(file_data):
+ #     doc = pdfplumber.open(BytesIO(file_data))
+ #     if len(doc.pages):
+ #         return doc.pages[0].extract_text()
+
+ # if __name__ == "__main__":
+
+ #     # Define the Gradio interface
+ #     # iface = gr.Interface(fn=process_pdf,
+ #     # demo = gr.Interface(fn=process_pdf,
+ #     #     inputs=gr.File(type="binary", label="Upload PDF"),
+ #     #     outputs=gr.Textbox(label="Extracted Text"),
+ #     #     title="PDF Text Extractor",
+ #     #     description="Upload a PDF file to extract.")
+ #     # demo.launch()
+ #     demo = gr.Interface(fn=process_pdf,
+ #                         inputs=gr.File(type="pdf"),
+ #                         outputs="text,
+ #                         title="PDF Text Extractor",
+ #                         description="Upload a PDF file to extract.")
+ #     demo.launch()
+
  import gradio as gr
+ import pdfplumber
+ from transformers import pipeline
  from io import BytesIO
+ import re
+
+ # Initialize the question-answering pipeline with a specific pre-trained model
+ qa_pipeline = pipeline("question-answering", model="deepset/gelectra-large-germanquad")
+
+ def extract_text_from_pdf(file_obj):
+     """Extracts text from a PDF file."""
      text = []
+     with pdfplumber.open(file_obj) as pdf:
+         for page in pdf.pages:
+             page_text = page.extract_text()
+             if page_text: # Make sure there's text on the page
+                 text.append(page_text)
+     return " ".join(text)
+
+ def answer_questions(context):
+     """Generates answers to predefined questions based on the provided context."""
      questions = [
          "Welches ist das Titel des Moduls?",
          "Welches ist das Sektor oder das Kernthema?",
          "Welches ist das Land?",
+         "Zu welchem Program oder EZ-Programm gehört das Projekt?"
      ]
+     answers = {q: qa_pipeline(question=q, context=context)['answer'] for q in questions}
      return answers

+ def process_pdf(file):
+     """Process a PDF file to extract text and then use the text to answer questions."""
+     # Read the PDF file from Gradio's file input, which is a temporary file path
+     with file as file_path:
+         text = extract_text_from_pdf(BytesIO(file_path.read()))
+     results = answer_questions(text)
+     return "\n".join(f"{q}: {a}" for q, a in results.items())
+
+ # Define the Gradio interface
+ iface = gr.Interface(
+     fn=process_pdf,
+     inputs=gr.inputs.File(type="pdf", label="Upload your PDF file"),
+     outputs=gr.outputs.Textbox(label="Extracted Information and Answers"),
+     title="PDF Text Extractor and Question Answerer",
+     description="Upload a PDF file to extract text and answer predefined questions based on the content."
+ )

  if __name__ == "__main__":
+     iface.launch()
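
Note on the new interface wiring: the version added by this commit still relies on the legacy gr.inputs / gr.outputs namespaces and a type="pdf" file input, which newer Gradio releases no longer provide, and process_pdf treats the uploaded file object as a context manager. Below is a minimal sketch of the same flow against the current Gradio API; it assumes gr.File(type="filepath") hands process_pdf a plain path string and adds an empty-text guard. The model name and the questions come from the commit, but the wiring around them is an untested adaptation, not the author's code.

import gradio as gr
import pdfplumber
from transformers import pipeline

# QA pipeline and questions as in the commit
qa_pipeline = pipeline("question-answering", model="deepset/gelectra-large-germanquad")

QUESTIONS = [
    "Welches ist das Titel des Moduls?",
    "Welches ist das Sektor oder das Kernthema?",
    "Welches ist das Land?",
    "Zu welchem Program oder EZ-Programm gehört das Projekt?",
]

def extract_text_from_pdf(path):
    """Concatenate the text of every page that has extractable text."""
    text = []
    with pdfplumber.open(path) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            if page_text:
                text.append(page_text)
    return " ".join(text)

def process_pdf(path):
    """Extract text from the uploaded PDF and answer the predefined questions."""
    # With gr.File(type="filepath"), Gradio passes the temporary file's path as a string.
    text = extract_text_from_pdf(path)
    if not text:
        return "No extractable text found in the PDF."
    answers = {q: qa_pipeline(question=q, context=text)["answer"] for q in QUESTIONS}
    return "\n".join(f"{q}: {a}" for q, a in answers.items())

iface = gr.Interface(
    fn=process_pdf,
    inputs=gr.File(type="filepath", label="Upload your PDF file"),
    outputs=gr.Textbox(label="Extracted Information and Answers"),
    title="PDF Text Extractor and Question Answerer",
    description="Upload a PDF file to extract text and answer predefined questions based on the content.",
)

if __name__ == "__main__":
    iface.launch()

Running python app.py starts the demo locally in the same way the committed script does.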