kxx-kkk committed on
Commit
e3706e0
·
verified ·
1 Parent(s): fd3d53b

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +26 -25
app.py CHANGED
@@ -30,6 +30,30 @@ def question_model():
30
  question_answerer = pipeline("question-answering", model=model, tokenizer=tokenizer, handle_impossible_answer=True)
31
  return question_answerer
32
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  # # get the answer by passing the context & question to the model
34
  # def question_answering(context, question):
35
  # with st.spinner(text="Loading question model..."):
@@ -90,29 +114,6 @@ def question_answering(context, question):
90
  container.write("<h5><b>Answer:</b></h5>" + answer + "<p><small>(F1 score: " + answer_score + ")</small></p><br>",
91
  unsafe_allow_html=True)
92
 
93
@st.cache_data(show_spinner=False)
def extract_text(file_path):
    """Return the OCR text of the PDF stored at ``file_path``.

    The PDF pages are converted to images with ``convert_from_path`` and
    each image is read with ``pytesseract.image_to_string``; the per-image
    results are concatenated in the order ``convert_from_path`` returns
    them.

    Only the OCR output is returned. An earlier PyPDF2 pass over the
    embedded text layer was removed because its result was always
    overwritten by the OCR text before returning.

    Args:
        file_path: Path of the PDF file on disk.

    Returns:
        The OCR text, with every isolated single newline replaced by a
        space. Runs of two or more consecutive newlines are left untouched.
    """
    with st.spinner(text="Extracting text from file..."):
        page_images = convert_from_path(file_path)  # Convert PDF pages to images
        ocr_text = "".join(
            pytesseract.image_to_string(page_image) for page_image in page_images
        )
        # Replace a newline that is neither preceded nor followed by another
        # newline with a space; blank-line separators (2+ newlines) are kept.
        return re.sub(r"(?<!\n)\n(?!\n)", " ", ocr_text)
116
 
117
 
118
  #-------------------- Main Webpage --------------------
@@ -178,8 +179,8 @@ with tab2:
178
  if not st.session_state.text_extracted:
179
  with tempfile.NamedTemporaryFile(delete=False) as temp_file:
180
  temp_file.write(uploaded_file.read()) # Save uploaded file to a temporary path
181
- raw_text = extract_text(temp_file.name)
182
- context2 = raw_text
183
  st.session_state.text_extracted = True
184
 
185
 
 
30
  question_answerer = pipeline("question-answering", model=model, tokenizer=tokenizer, handle_impossible_answer=True)
31
  return question_answerer
32
 
33
@st.cache_data(show_spinner=False)
def extract_text(file_path):
    """Return the OCR text of the PDF stored at ``file_path``.

    The PDF pages are converted to images with ``convert_from_path`` and
    each image is read with ``pytesseract.image_to_string``; the per-image
    results are concatenated in the order ``convert_from_path`` returns
    them.

    Only the OCR output is returned. An earlier PyPDF2 pass over the
    embedded text layer was removed because its result was always
    overwritten by the OCR text before returning.

    Args:
        file_path: Path of the PDF file on disk.

    Returns:
        The OCR text, with every isolated single newline replaced by a
        space. Runs of two or more consecutive newlines are left untouched.
    """
    with st.spinner(text="Extracting text from file..."):
        page_images = convert_from_path(file_path)  # Convert PDF pages to images
        ocr_text = "".join(
            pytesseract.image_to_string(page_image) for page_image in page_images
        )
        # Replace a newline that is neither preceded nor followed by another
        # newline with a space; blank-line separators (2+ newlines) are kept.
        return re.sub(r"(?<!\n)\n(?!\n)", " ", ocr_text)
56
+
57
  # # get the answer by passing the context & question to the model
58
  # def question_answering(context, question):
59
  # with st.spinner(text="Loading question model..."):
 
114
  container.write("<h5><b>Answer:</b></h5>" + answer + "<p><small>(F1 score: " + answer_score + ")</small></p><br>",
115
  unsafe_allow_html=True)
116
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
117
 
118
 
119
  #-------------------- Main Webpage --------------------
 
179
  if not st.session_state.text_extracted:
180
  with tempfile.NamedTemporaryFile(delete=False) as temp_file:
181
  temp_file.write(uploaded_file.read()) # Save uploaded file to a temporary path
182
+ raw_text = extract_text(temp_file.name)
183
+ context2 = raw_text
184
  st.session_state.text_extracted = True
185
 
186