"""Streamlit app: choose a spaCy model, upload documents, and show their text.

The script reads the available models from ``models.json``, lets the user
pick a language and a model, downloads and loads that model, then extracts
the text of every uploaded file with textract, runs it through the spaCy
pipeline, and writes the extracted text to the page.
"""

import random
import re
import tempfile
from pathlib import Path

import spacy
import srsly
import streamlit as st
import textract
from spacy.matcher import Matcher
from spacy.tokens import DocBin

st.title('Index and Search a Collection of Documents')


# NOTE(review): ``st.cache`` is deprecated in recent Streamlit releases;
# consider ``st.cache_resource`` when the Streamlit dependency is upgraded.
@st.cache
def download_model(select_model: str) -> bool:
    """Download the spaCy model package named *select_model*.

    A spinner is shown while the download runs. The call is cached by
    Streamlit, so the download is not repeated for the same model name on
    every script rerun.

    Args:
        select_model: Name of the spaCy model package to download.

    Returns:
        Always ``True`` once ``spacy.cli.download`` has returned.
    """
    with st.spinner(f'Loading model {select_model}'):
        spacy.cli.download(select_model)
    return True


doc_bin = DocBin()

# Presumably a mapping of language name -> list of spaCy model names
# (it is indexed by language and the value is used as selectbox options).
models = srsly.read_json('models.json')
models[''] = []  # require the user to choose a language

# Materialize the keys so the placeholder entry can be located by value
# instead of assuming it is the last key.
languages = list(models)
language = st.selectbox(
    "Language",
    languages,
    index=languages.index(''),
    help="Select the language of your materials.",
)

if language:
    select_model = st.selectbox("Model", models[language], help="spaCy model")
    if select_model:
        model_downloaded = download_model(select_model)
        if model_downloaded:
            nlp = spacy.load(select_model)
            nlp.max_length = 1200000

            # The uploader lives inside this branch because processing
            # needs the loaded ``nlp`` pipeline.
            uploaded_files = st.file_uploader(
                "Select files to process", accept_multiple_files=True
            )
            for uploaded_file in uploaded_files or []:
                # textract selects its parser from the file extension, so the
                # temporary copy must keep the suffix of the uploaded file.
                suffix = Path(uploaded_file.name).suffix
                try:
                    with tempfile.NamedTemporaryFile(suffix=suffix) as temp:
                        temp.write(uploaded_file.getvalue())
                        # Flush so textract, which reopens the file by name,
                        # sees the complete contents on disk.
                        temp.flush()
                        text = textract.process(temp.name).decode('utf-8')
                    # The parsed Doc is not used further yet; the call still
                    # surfaces pipeline errors (e.g. text over max_length).
                    doc = nlp(text)
                    st.write(text)
                except Exception as e:
                    # UI boundary: report the failure for this file and
                    # continue with the remaining uploads.
                    st.error(e)

# TODO: offer the results for download, e.g.
# st.download_button('Download', '', 'text/plain')