# bolete / app.py
# apjanco
# in progress, need search engine
# 4c042d9
# raw
# history blame
# No virus
# 1.61 kB
import re
import streamlit as st
import textract
import tempfile
import random
from pathlib import Path
import spacy
from spacy.tokens import DocBin
import srsly
from spacy.matcher import Matcher
# Page heading shown at the top of the Streamlit app.
st.title('Index and Search a Collection of Documents')
@st.cache
def download_model(select_model: str) -> bool:
    """Download the spaCy model named ``select_model``.

    A Streamlit spinner is displayed while ``spacy.cli.download`` runs.
    The function is decorated with ``st.cache``.

    Args:
        select_model: Name of the spaCy model package to download.

    Returns:
        ``True`` once the download call has returned without raising.
    """
    spinner_message = f'Loading model {select_model}'
    with st.spinner(spinner_message):
        spacy.cli.download(select_model)
    return True
def _extract_text(uploaded_file) -> str:
    """Return the text content of a Streamlit uploaded file.

    The upload is written to a file inside a temporary directory, keeping the
    original file extension so that textract can choose a parser from it. The
    file is fully written and closed before textract opens it by path, and the
    directory is removed afterwards.

    Args:
        uploaded_file: An item returned by ``st.file_uploader``; its ``name``
            and ``getvalue()`` are used.

    Returns:
        The extracted text, decoded as UTF-8.

    Raises:
        Exception: Whatever textract raises while parsing, or
            ``UnicodeDecodeError`` if the output is not valid UTF-8.
    """
    suffix = Path(uploaded_file.name).suffix
    with tempfile.TemporaryDirectory() as temp_dir:
        temp_path = Path(temp_dir) / f'upload{suffix}'
        # write_bytes closes the file, so nothing is left in a write buffer
        # when textract reopens it by name.
        temp_path.write_bytes(uploaded_file.getvalue())
        try:
            raw_text = textract.process(str(temp_path))
        except textract.exceptions.ExtensionNotSupported:
            # Unknown extension: retry under an extensionless name, which is
            # how every upload was processed before the suffix was preserved
            # (textract appears to treat that as plain text -- verify against
            # the installed textract version).
            fallback_path = temp_path.with_suffix('')
            temp_path.rename(fallback_path)
            raw_text = textract.process(str(fallback_path))
    return raw_text.decode('utf-8')


# Container for processed documents.
# NOTE(review): nothing is added to it in the code visible here.
doc_bin = DocBin()

# Presumably maps a language name to a list of spaCy model names; verify
# against models.json.
models = srsly.read_json('models.json')
models[''] = []  # require the user to choose a language
languages = list(models)
language = st.selectbox(
    "Language",
    languages,
    # Pre-select the last entry, i.e. the empty placeholder added above.
    index=len(languages) - 1,
    help="Select the language of your materials.",
)

if language:
    select_model = st.selectbox("Model", models[language], help="spaCy model")
    if select_model:
        model_downloaded = download_model(select_model)
        if model_downloaded:
            nlp = spacy.load(select_model)
            nlp.max_length = 1200000
            uploaded_files = st.file_uploader(
                "Select files to process",
                accept_multiple_files=True,
            )
            # Guard against a missing selection so the loop is simply skipped.
            for uploaded_file in uploaded_files or []:
                try:
                    text = _extract_text(uploaded_file)
                    # NOTE(review): the parsed doc is not stored or used yet.
                    doc = nlp(text)
                    st.write(text)
                except Exception as e:
                    # UI boundary: report the failure for this file and
                    # continue with the remaining uploads.
                    st.error(e)
            # TODO: offer the results for download, e.g.
            # st.download_button('Download', '', 'text/plain')