# Streamlit app: let the user pick a spaCy language/model, upload documents,
# extract their text with textract, run the spaCy pipeline on it and display
# the extracted text.
import tempfile
from pathlib import Path

import spacy
import srsly
import streamlit as st
import textract
from spacy.tokens import DocBin

st.title('Index and Search a Collection of Documents')


# NOTE(review): newer Streamlit releases deprecate ``st.cache`` in favour of
# ``st.cache_data`` / ``st.cache_resource`` -- confirm the targeted version.
@st.cache
def download_model(select_model: str):
    """Download the spaCy model package named ``select_model``.

    A spinner is shown while ``spacy.cli.download`` runs. The function is
    wrapped in ``st.cache`` so the download is not repeated for the same
    model name on every script rerun.

    Args:
        select_model: Name of the spaCy model package to download.

    Returns:
        ``True`` once the download call has returned.
    """
    with st.spinner(f'Loading model {select_model}'):
        spacy.cli.download(select_model)
    return True


def _extract_text(uploaded_file) -> str:
    """Return the UTF-8 decoded text textract extracts from an uploaded file.

    The upload is written to a file inside a temporary directory that keeps
    the original filename extension, because textract chooses its parser from
    the extension. The file is fully written and closed before textract opens
    it by path, and the directory is removed afterwards.

    Args:
        uploaded_file: Object returned by ``st.file_uploader``; its ``name``
            and ``getvalue()`` are used.

    Returns:
        The extracted text.

    Raises:
        Whatever ``textract.process`` raises for unsupported or unreadable
        files, and ``UnicodeDecodeError`` if the output is not valid UTF-8.
    """
    suffix = Path(uploaded_file.name).suffix
    with tempfile.TemporaryDirectory() as tmp_dir:
        tmp_path = Path(tmp_dir) / f'upload{suffix}'
        tmp_path.write_bytes(uploaded_file.getvalue())
        raw = textract.process(str(tmp_path))
    return raw.decode('utf-8')


doc_bin = DocBin()
models = srsly.read_json('models.json')
models[''] = []  # require the user to choose a language
languages = list(models)
language = st.selectbox(
    "Language",
    languages,
    # The empty entry was added last, so it is the initial selection.
    index=len(models) - 1,
    help="Select the language of your materials.",
)

if language:
    select_model = st.selectbox("Model", models[language], help="spaCy model")
    if select_model:
        model_downloaded = download_model(select_model)
        if model_downloaded:
            nlp = spacy.load(select_model)
            # Raise spaCy's per-document character limit for long documents.
            nlp.max_length = 1200000

            uploaded_files = st.file_uploader(
                "Select files to process", accept_multiple_files=True
            )
            # Guard against a falsy return value when nothing is uploaded.
            for uploaded_file in uploaded_files or []:
                try:
                    text = _extract_text(uploaded_file)
                    # NOTE(review): the parsed Doc is not stored anywhere yet
                    # (``doc_bin`` is never filled) -- confirm intent.
                    doc = nlp(text)
                    st.write(text)
                except Exception as e:
                    # UI boundary: report the failure and continue with the
                    # remaining files instead of aborting the whole run.
                    st.error(e)

            #st.download_button('Download', '', 'text/plain')