ugmSorcero committed on
Commit
46323da
1 Parent(s): 753ae25

Adds image-to-text (OCR) extraction and Tesseract Linux dependencies

Browse files
interface/components.py CHANGED
@@ -80,6 +80,11 @@ def component_article_url(container):
80
  st.markdown("---")
81
  else:
82
  break
 
 
 
 
 
83
  corpus = [
84
  {"text": doc["text"], "id": doc_id} for doc_id, doc in enumerate(urls)
85
  ]
@@ -93,7 +98,7 @@ def component_file_input(container):
93
  doc_id = 1
94
  with st.expander("Enter Files"):
95
  while True:
96
- file = st.file_uploader("Upload a .txt, .pdf, .csv file", key=doc_id)
97
  if file != None:
98
  extracted_text = extract_text_from_file(file)
99
  if extracted_text != None:
@@ -104,6 +109,11 @@ def component_file_input(container):
104
  break
105
  else:
106
  break
 
 
 
 
 
107
  corpus = [
108
  {"text": doc["text"], "id": doc_id} for doc_id, doc in enumerate(files)
109
  ]
 
80
  st.markdown("---")
81
  else:
82
  break
83
+
84
+ for idx, doc in enumerate(urls):
85
+ with st.expander(f"Preview URL {idx}"):
86
+ st.write(doc)
87
+
88
  corpus = [
89
  {"text": doc["text"], "id": doc_id} for doc_id, doc in enumerate(urls)
90
  ]
 
98
  doc_id = 1
99
  with st.expander("Enter Files"):
100
  while True:
101
+ file = st.file_uploader("Upload a .txt, .pdf, .csv, image file", key=doc_id)
102
  if file != None:
103
  extracted_text = extract_text_from_file(file)
104
  if extracted_text != None:
 
109
  break
110
  else:
111
  break
112
+
113
+ for idx, doc in enumerate(files):
114
+ with st.expander(f"Preview File {idx}"):
115
+ st.write(doc)
116
+
117
  corpus = [
118
  {"text": doc["text"], "id": doc_id} for doc_id, doc in enumerate(files)
119
  ]
interface/pages.py CHANGED
@@ -26,7 +26,6 @@ def page_landing_page(container):
26
  st.markdown(
27
  "TODO list:"
28
  "\n - Build other pipelines"
29
- "\n - Include file/url indexing"
30
  "\n - [Optional] Include text to audio to read responses"
31
  )
32
 
 
26
  st.markdown(
27
  "TODO list:"
28
  "\n - Build other pipelines"
 
29
  "\n - [Optional] Include text to audio to read responses"
30
  )
31
 
interface/utils.py CHANGED
@@ -5,7 +5,8 @@ from newspaper import Article
5
  from PyPDF2 import PdfFileReader
6
  import streamlit as st
7
  import pandas as pd
8
-
 
9
 
10
  def get_pipelines():
11
  pipeline_names, pipeline_funcs = list(
@@ -25,7 +26,7 @@ def extract_text_from_url(url: str):
25
 
26
  return article.text
27
 
28
-
29
  def extract_text_from_file(file):
30
  # read text file
31
  if file.type == "text/plain":
@@ -76,6 +77,10 @@ def extract_text_from_file(file):
76
  continue
77
  file_text += " " + txt
78
  return file_text
 
 
 
 
79
 
80
  else:
81
  st.warning(f"File type {file.type} not supported")
 
5
  from PyPDF2 import PdfFileReader
6
  import streamlit as st
7
  import pandas as pd
8
+ import pytesseract
9
+ from PIL import Image
10
 
11
  def get_pipelines():
12
  pipeline_names, pipeline_funcs = list(
 
26
 
27
  return article.text
28
 
29
+ @st.experimental_memo
30
  def extract_text_from_file(file):
31
  # read text file
32
  if file.type == "text/plain":
 
77
  continue
78
  file_text += " " + txt
79
  return file_text
80
+
81
+ # read image file (OCR)
82
+ elif file.type == 'image/jpeg':
83
+ return pytesseract.image_to_string(Image.open(file))
84
 
85
  else:
86
  st.warning(f"File type {file.type} not supported")
packages.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ tesseract-ocr-all
requirements.txt CHANGED
@@ -4,4 +4,5 @@ farm-haystack==1.8.0
4
  black==22.8.0
5
  plotly==5.10.0
6
  newspaper3k==0.2.8
7
- PyPDF2==2.10.7
 
 
4
  black==22.8.0
5
  plotly==5.10.0
6
  newspaper3k==0.2.8
7
+ PyPDF2==2.10.7
8
+ pytesseract==0.3.10