"""Streamlit app that runs a spaCy pipeline over uploaded files and searches them.

The user picks a language and a spaCy model, uploads one or more files and
types a query in the sidebar. Each file is converted to text with textract,
processed with the selected model, and the query's token matches are written
to the page.
"""
import tempfile
from collections import Counter
from pathlib import Path
from typing import List, Optional

import spacy
import srsly
import streamlit as st
import textract
from spacy.matcher import Matcher
from spacy.tokens import DocBin, Doc

# Import CSS file
# NOTE(review): the stylesheet has to be wrapped in a <style> element to take
# effect when rendered through st.markdown; confirm this matches the intended
# injection for style.css.
with open("style.css") as css_file:
    st.markdown(f"<style>{css_file.read()}</style>", unsafe_allow_html=True)

st.title('Index and Search a Collection of Documents')


# NOTE(review): st.cache is deprecated in recent Streamlit releases; kept as-is
# so the app keeps working on the Streamlit version it was written for.
@st.cache
def download_model(select_model: str):
    """Download the spaCy model package named ``select_model``.

    A spinner is shown while ``spacy.cli.download`` runs. The function is
    wrapped in ``st.cache`` so the result is memoized per model name.

    Returns:
        True once the download call has returned.
    """
    with st.spinner(f'Loading model {select_model}'):
        spacy.cli.download(select_model)
    return True


def search_docs(
    query: str,
    matcher: Matcher,
    documents: List[Doc],
    nlp,
    match_pattern: Optional[List[dict]] = None,
):
    """Search ``documents`` for ``query`` using a spaCy ``Matcher``.

    Args:
        query: Search text; also used as the key the pattern is registered
            under in ``matcher``.
        matcher: Matcher the pattern is added to and run with.
        documents: Processed spaCy docs to search.
        nlp: Callable pipeline used to tokenize ``query``.
        match_pattern: Optional ready-made token pattern (a list of token
            attribute dicts). When omitted or empty, a pattern matching each
            query token case-insensitively (``LOWER``) is built instead.

    Returns:
        A flat list of ``(match_id, start, end)`` tuples gathered from every
        document, in document order. Empty when the query yields no tokens.
    """
    if match_pattern:
        pattern = match_pattern
    else:
        # Only tokenize the query when no explicit pattern was supplied.
        pattern = [{"LOWER": token.text.lower()} for token in nlp(query)]

    # A zero-token pattern cannot be registered with the matcher, so an empty
    # query simply has no results.
    if not pattern:
        return []

    matcher.add(query, [pattern])

    results = []
    for doc in documents:
        results.extend(matcher(doc))  # List[(match_id, start, end)]
    return results


def _extract_text(uploaded_file) -> str:
    """Return the UTF-8 decoded text textract extracts from an uploaded file."""
    # Keep the original extension on the temp file; names without a dot get
    # no suffix rather than an invented one.
    suffix = Path(uploaded_file.name).suffix
    with tempfile.NamedTemporaryFile(suffix=suffix) as temp:
        temp.write(uploaded_file.getvalue())
        # textract reopens the file by name, so the bytes must be on disk.
        temp.flush()
        return textract.process(temp.name).decode('utf-8')


def _render_entity_buttons(doc: Doc, widget_prefix: str) -> None:
    """Add one sidebar button per entity label in ``doc``.

    Clicking a button writes that label's count followed by the text of every
    entity carrying the label. ``widget_prefix`` keeps widget keys distinct
    when several documents share a label.
    """
    ent_freq = Counter(ent.label_ for ent in doc.ents)
    for label, count in ent_freq.items():
        if st.sidebar.button(label, key=f"{widget_prefix}-{label}"):
            st.sidebar.write(count)
            for ent in doc.ents:
                if ent.label_ == label:
                    st.sidebar.write(ent.text)


# presumably models.json maps a language name to a list of spaCy model names;
# verify against the data file.
models = srsly.read_json('models.json')
models[''] = []  # require the user to choose a language
languages = list(models)
language = st.selectbox(
    "Language",
    languages,
    index=len(languages) - 1,  # the blank entry added above is last
    help="Select the language of your materials.",
)

if language:
    select_model = st.selectbox("Model", models[language], help="spaCy model")
    if select_model and download_model(select_model):
        nlp = spacy.load(select_model)
        nlp.max_length = 1200000
        matcher = Matcher(nlp.vocab)

        uploaded_files = st.file_uploader(
            "Select files to process", accept_multiple_files=True
        )
        query = st.sidebar.text_input(label="Enter your query", value="...")

        documents = []
        for file_index, uploaded_file in enumerate(uploaded_files or []):
            # Best-effort per file: report the failure in the UI and continue
            # with the remaining uploads.
            try:
                doc = nlp(_extract_text(uploaded_file))
                documents.append(doc)
                _render_entity_buttons(doc, widget_prefix=str(file_index))
            except Exception as e:
                st.error(e)

        results = search_docs(query, matcher, documents, nlp)
        st.write(results)
        #st.download_button('Download', '', 'text/plain')