Spaces:
Runtime error
Runtime error
ugmSorcero
committed on
Commit
•
46323da
1
Parent(s):
753ae25
Adds image to text and tesseract linux dependencies
Browse files
- interface/components.py +11 -1
- interface/pages.py +0 -1
- interface/utils.py +7 -2
- packages.txt +1 -0
- requirements.txt +2 -1
interface/components.py
CHANGED
@@ -80,6 +80,11 @@ def component_article_url(container):
|
|
80 |
st.markdown("---")
|
81 |
else:
|
82 |
break
|
|
|
|
|
|
|
|
|
|
|
83 |
corpus = [
|
84 |
{"text": doc["text"], "id": doc_id} for doc_id, doc in enumerate(urls)
|
85 |
]
|
@@ -93,7 +98,7 @@ def component_file_input(container):
|
|
93 |
doc_id = 1
|
94 |
with st.expander("Enter Files"):
|
95 |
while True:
|
96 |
-
file = st.file_uploader("Upload a .txt, .pdf, .csv file", key=doc_id)
|
97 |
if file != None:
|
98 |
extracted_text = extract_text_from_file(file)
|
99 |
if extracted_text != None:
|
@@ -104,6 +109,11 @@ def component_file_input(container):
|
|
104 |
break
|
105 |
else:
|
106 |
break
|
|
|
|
|
|
|
|
|
|
|
107 |
corpus = [
|
108 |
{"text": doc["text"], "id": doc_id} for doc_id, doc in enumerate(files)
|
109 |
]
|
|
|
80 |
st.markdown("---")
|
81 |
else:
|
82 |
break
|
83 |
+
|
84 |
+
for idx, doc in enumerate(urls):
|
85 |
+
with st.expander(f"Preview URL {idx}"):
|
86 |
+
st.write(doc)
|
87 |
+
|
88 |
corpus = [
|
89 |
{"text": doc["text"], "id": doc_id} for doc_id, doc in enumerate(urls)
|
90 |
]
|
|
|
98 |
doc_id = 1
|
99 |
with st.expander("Enter Files"):
|
100 |
while True:
|
101 |
+
file = st.file_uploader("Upload a .txt, .pdf, .csv, image file", key=doc_id)
|
102 |
if file != None:
|
103 |
extracted_text = extract_text_from_file(file)
|
104 |
if extracted_text != None:
|
|
|
109 |
break
|
110 |
else:
|
111 |
break
|
112 |
+
|
113 |
+
for idx, doc in enumerate(files):
|
114 |
+
with st.expander(f"Preview File {idx}"):
|
115 |
+
st.write(doc)
|
116 |
+
|
117 |
corpus = [
|
118 |
{"text": doc["text"], "id": doc_id} for doc_id, doc in enumerate(files)
|
119 |
]
|
interface/pages.py
CHANGED
@@ -26,7 +26,6 @@ def page_landing_page(container):
|
|
26 |
st.markdown(
|
27 |
"TODO list:"
|
28 |
"\n - Build other pipelines"
|
29 |
-
"\n - Include file/url indexing"
|
30 |
"\n - [Optional] Include text to audio to read responses"
|
31 |
)
|
32 |
|
|
|
26 |
st.markdown(
|
27 |
"TODO list:"
|
28 |
"\n - Build other pipelines"
|
|
|
29 |
"\n - [Optional] Include text to audio to read responses"
|
30 |
)
|
31 |
|
interface/utils.py
CHANGED
@@ -5,7 +5,8 @@ from newspaper import Article
|
|
5 |
from PyPDF2 import PdfFileReader
|
6 |
import streamlit as st
|
7 |
import pandas as pd
|
8 |
-
|
|
|
9 |
|
10 |
def get_pipelines():
|
11 |
pipeline_names, pipeline_funcs = list(
|
@@ -25,7 +26,7 @@ def extract_text_from_url(url: str):
|
|
25 |
|
26 |
return article.text
|
27 |
|
28 |
-
|
29 |
def extract_text_from_file(file):
|
30 |
# read text file
|
31 |
if file.type == "text/plain":
|
@@ -76,6 +77,10 @@ def extract_text_from_file(file):
|
|
76 |
continue
|
77 |
file_text += " " + txt
|
78 |
return file_text
|
|
|
|
|
|
|
|
|
79 |
|
80 |
else:
|
81 |
st.warning(f"File type {file.type} not supported")
|
|
|
5 |
from PyPDF2 import PdfFileReader
|
6 |
import streamlit as st
|
7 |
import pandas as pd
|
8 |
+
import pytesseract
|
9 |
+
from PIL import Image
|
10 |
|
11 |
def get_pipelines():
|
12 |
pipeline_names, pipeline_funcs = list(
|
|
|
26 |
|
27 |
return article.text
|
28 |
|
29 |
+
@st.experimental_memo
|
30 |
def extract_text_from_file(file):
|
31 |
# read text file
|
32 |
if file.type == "text/plain":
|
|
|
77 |
continue
|
78 |
file_text += " " + txt
|
79 |
return file_text
|
80 |
+
|
81 |
+
# read image file (OCR)
|
82 |
+
elif file.type == 'image/jpeg':
|
83 |
+
return pytesseract.image_to_string(Image.open(file))
|
84 |
|
85 |
else:
|
86 |
st.warning(f"File type {file.type} not supported")
|
packages.txt
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
tesseract-ocr-all
|
requirements.txt
CHANGED
@@ -4,4 +4,5 @@ farm-haystack==1.8.0
|
|
4 |
black==22.8.0
|
5 |
plotly==5.10.0
|
6 |
newspaper3k==0.2.8
|
7 |
-
PyPDF2==2.10.7
|
|
|
|
4 |
black==22.8.0
|
5 |
plotly==5.10.0
|
6 |
newspaper3k==0.2.8
|
7 |
+
PyPDF2==2.10.7
|
8 |
+
pytesseract==0.3.10
|