apjanco committed on
Commit
4c042d9
1 Parent(s): ce57bf8

in progress, need search engine

Browse files
Files changed (4) hide show
  1. app.py +53 -0
  2. models.json +149 -0
  3. packages.txt +15 -0
  4. requirements.txt +4 -0
app.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import streamlit as st
3
+ import textract
4
+ import tempfile
5
+ import random
6
+ from pathlib import Path
7
+ import spacy
8
+ from spacy.tokens import DocBin
9
+ import srsly
10
+ from spacy.matcher import Matcher
11
+
12
+ st.title('Index and Search a Collection of Documents')
13
+
14
@st.cache
def download_model(select_model: str):
    """Download the spaCy model package named ``select_model``.

    A Streamlit spinner is displayed while ``spacy.cli.download`` runs.
    The function is wrapped in ``st.cache`` so that script reruns with the
    same model name can reuse the cached result.

    Args:
        select_model: Name of the spaCy pipeline package to download,
            e.g. one of the entries listed in ``models.json``.

    Returns:
        ``True`` once the download call has returned.
    """
    spinner_message = f'Loading model {select_model}'
    with st.spinner(spinner_message):
        spacy.cli.download(select_model)
    return True
19
+
20
# Container intended to collect processed documents; nothing is added to it yet.
doc_bin = DocBin()

# Mapping of language name -> list of available spaCy model names.
models = srsly.read_json('models.json')
models[''] = []  # require the user to choose a language
languages = list(models)
language = st.selectbox(
    "Language",
    languages,
    # The blank entry was appended last, so it is the default selection.
    index=len(languages) - 1,
    help="Select the language of your materials.",
)

# Define both names up front so the checks below never hit an unbound
# variable when the user has not picked a language or a model yet.
select_model = None
model_downloaded = False

if language:
    select_model = st.selectbox("Model", models[language], help="spaCy model")
if select_model:
    model_downloaded = download_model(select_model)

if model_downloaded:
    nlp = spacy.load(select_model)
    nlp.max_length = 1200000

    uploaded_files = st.file_uploader(
        "Select files to process", accept_multiple_files=True
    )

    for uploaded_file in uploaded_files:
        # textract selects its parser from the filename extension, so the
        # temporary copy must keep the extension of the uploaded file.
        suffix = Path(uploaded_file.name).suffix

        # The context manager closes (and thereby removes) the temp file
        # even when extraction fails.
        with tempfile.NamedTemporaryFile(suffix=suffix) as temp:
            temp.write(uploaded_file.getvalue())
            # textract reopens the file by path; flush so it sees all bytes.
            temp.flush()
            try:
                text = textract.process(temp.name)
                text = text.decode('utf-8')
                doc = nlp(text)
                st.write(text)
            except Exception as e:
                # Report the failure for this file in the UI and continue
                # with the remaining uploads.
                st.error(e)
50
+
51
+
52
+ #st.download_button('Download', '', 'text/plain')
53
+
models.json ADDED
@@ -0,0 +1,149 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "Catalan": [
3
+ "",
4
+ "ca_core_news_sm",
5
+ "ca_core_news_md",
6
+ "ca_core_news_lg",
7
+ "ca_core_news_trf"
8
+ ],
9
+ "Chinese":[
10
+ "",
11
+ "zh_core_web_sm",
12
+ "zh_core_web_md",
13
+ "zh_core_web_lg",
14
+ "zh_core_web_trf"
15
+ ],
16
+ "Croatian": [
17
+ "",
18
+ "hr_core_news_sm",
19
+ "hr_core_news_md",
20
+ "hr_core_news_lg"
21
+ ],
22
+ "Danish": [
23
+ "",
24
+ "da_core_news_sm",
25
+ "da_core_news_md",
26
+ "da_core_news_lg",
27
+ "da_core_news_trf"
28
+ ],
29
+ "Dutch":[
30
+ "",
31
+ "nl_core_news_sm",
32
+ "nl_core_news_md",
33
+ "nl_core_news_lg"
34
+ ],
35
+ "English": [
36
+ "",
37
+ "en_core_web_sm",
38
+ "en_core_web_md",
39
+ "en_core_web_lg",
40
+ "en_core_web_trf"
41
+ ],
42
+ "Finnish":[
43
+ "",
44
+ "fi_core_news_sm",
45
+ "fi_core_news_md",
46
+ "fi_core_news_lg"
47
+ ],
48
+ "French":[
49
+ "",
50
+ "fr_core_news_sm",
51
+ "fr_core_news_md",
52
+ "fr_core_news_lg",
53
+ "fr_dep_news_trf"
54
+ ],
55
+ "German": [
56
+ "",
57
+ "de_core_news_sm",
58
+ "de_core_news_md",
59
+ "de_core_news_lg",
60
+ "de_dep_news_trf"
61
+ ],
62
+ "Greek":[
63
+ "",
64
+ "el_core_news_sm",
65
+ "el_core_news_md",
66
+ "el_core_news_lg"
67
+ ],
68
+ "Italian":[
69
+ "",
70
+ "it_core_news_sm",
71
+ "it_core_news_md",
72
+ "it_core_news_lg"
73
+ ],
74
+ "Japanese":[
75
+ "",
76
+ "ja_core_news_sm",
77
+ "ja_core_news_md",
78
+ "ja_core_news_lg",
79
+ "ja_core_news_trf"
80
+ ],
81
+ "Korean":[
82
+ "",
83
+ "ko_core_news_sm",
84
+ "ko_core_news_md",
85
+ "ko_core_news_lg"
86
+ ],
87
+ "Lithuanian":[
88
+ "",
89
+ "lt_core_news_sm",
90
+ "lt_core_news_md",
91
+ "lt_core_news_lg"
92
+ ],
93
+ "Macedonian":[
94
+ "",
95
+ "mk_core_news_sm",
96
+ "mk_core_news_md",
97
+ "mk_core_news_lg"
98
+ ],
99
+ "Multi-language":[
100
+ "",
101
+ "xx_ent_wiki_sm",
102
+ "xx_sent_ud_sm"
103
+ ],
104
+ "Norwegian Bokmål":[
105
+ "",
106
+ "nb_core_news_sm",
107
+ "nb_core_news_md",
108
+ "nb_core_news_lg"
109
+ ],
110
+ "Polish":[
111
+ "",
112
+ "pl_core_news_sm",
113
+ "pl_core_news_md",
114
+ "pl_core_news_lg"
115
+ ],
116
+ "Portuguese":[
117
+ "",
118
+ "pt_core_news_sm",
119
+ "pt_core_news_md",
120
+ "pt_core_news_lg"
121
+ ],
122
+ "Romanian":[
123
+ "",
124
+ "ro_core_news_sm",
125
+ "ro_core_news_md",
126
+ "ro_core_news_lg"
127
+ ],
128
+ "Russian":[
129
+ "",
130
+ "ru_core_news_sm",
131
+ "ru_core_news_md",
132
+ "ru_core_news_lg"
133
+ ],
134
+ "Spanish":[
135
+ "",
136
+ "es_core_news_sm",
137
+ "es_core_news_md",
138
+ "es_core_news_lg",
139
+ "es_dep_news_trf"
140
+ ],
141
+ "Swedish":[
142
+ "",
143
+ "sv_core_news_sm",
144
+ "sv_core_news_md",
145
+ "sv_core_news_lg"
146
+
147
+ ]
148
+
149
+ }
packages.txt ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ python-dev
2
+ libxml2-dev
3
+ libxslt1-dev
4
+ antiword
5
+ unrtf
6
+ poppler-utils
7
+ tesseract-ocr
8
+ flac
9
+ ffmpeg
10
+ lame
11
+ libmad0
12
+ libsox-fmt-mp3
13
+ sox
14
+ libjpeg-dev
15
+ swig
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ streamlit==1.11.1
2
+ textract==1.6.5
3
+ spacy==3.4.1
4
+ #spacy-streamlit==1.0.4