File size: 1,610 Bytes
4c042d9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
import re 
import streamlit as st
import textract
import tempfile
import random
from pathlib import Path
import spacy
from spacy.tokens import DocBin
import srsly
from spacy.matcher import Matcher

# Page heading shown at the top of the Streamlit app.
st.title('Index and Search a Collection of Documents')

@st.cache
def download_model(select_model: str):
    """Download the spaCy model named *select_model*, showing a spinner meanwhile.

    The function is wrapped in ``st.cache``, so Streamlit can reuse the
    result for a model name it has already seen instead of running the
    download again on every script rerun.

    Args:
        select_model: Name of the spaCy model package to download.

    Returns:
        ``True`` once the download call has returned.
    """
    spinner_message = f'Loading model {select_model}'
    with st.spinner(spinner_message):
        spacy.cli.download(select_model)
    # A constant truthy value is all the caller checks for.
    return True

# Container for processed documents.
# NOTE(review): nothing is added to it in this section -- presumably a later
# step is meant to collect the parsed docs here; confirm.
doc_bin = DocBin()

# models.json is indexed below by language and yields the model choices
# offered for that language.
models = srsly.read_json('models.json')
models[''] = []  # require the user to choose a language
languages = list(models)
language = st.selectbox(
    "Language",
    languages,
    # Pre-select the last entry, which is the empty placeholder added above
    # (unless models.json already had an '' key).
    index=len(languages) - 1,
    help="Select the language of your materials.",
)
if language:
    select_model = st.selectbox("Model", models[language], help="spaCy model")
    if select_model:
        model_downloaded = download_model(select_model)

        if model_downloaded:

            nlp = spacy.load(select_model)

            # Raise spaCy's per-document character limit so long uploads
            # are accepted by the pipeline.
            nlp.max_length = 1200000

            uploaded_files = st.file_uploader("Select files to process", accept_multiple_files=True)

            # `or []` keeps the loop safe if the uploader yields None.
            for uploaded_file in uploaded_files or []:
                try:
                    # Write the upload to a real file inside a throwaway
                    # directory. The file is fully written and closed before
                    # textract opens it by path, and the directory (with the
                    # file) is removed as soon as extraction finishes.
                    with tempfile.TemporaryDirectory() as tmp_dir:
                        # textract picks its parser from the file extension,
                        # so keep the extension of the uploaded file.
                        suffix = Path(uploaded_file.name).suffix
                        tmp_path = Path(tmp_dir) / f"upload{suffix}"
                        tmp_path.write_bytes(uploaded_file.getvalue())
                        text = textract.process(str(tmp_path))
                    text = text.decode('utf-8')
                    # NOTE(review): the parsed doc is not used yet in this
                    # section; the call still surfaces pipeline errors
                    # (e.g. text longer than nlp.max_length) to the user.
                    doc = nlp(text)
                    st.write(text)
                except Exception as e:
                    # UI boundary: show the failure for this file and carry
                    # on with the remaining uploads instead of crashing.
                    st.error(e)


            #st.download_button('Download', '', 'text/plain')