first steps on search
Browse files- .gitignore +1 -0
- app.py +24 -3
.gitignore
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
venv/
|
app.py
CHANGED
@@ -1,10 +1,12 @@
|
|
1 |
import streamlit as st
|
2 |
import textract
|
3 |
import tempfile
|
|
|
4 |
import spacy
|
5 |
from spacy.tokens import DocBin, Doc
|
6 |
from collections import Counter
|
7 |
import srsly
|
|
|
8 |
|
9 |
# Import CSS file
|
10 |
with open("style.css") as f:
|
@@ -18,8 +20,23 @@ def download_model(select_model:str):
|
|
18 |
spacy.cli.download(select_model)
|
19 |
return True
|
20 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
21 |
|
22 |
-
doc_bin = DocBin()
|
23 |
models = srsly.read_json('models.json')
|
24 |
models[''] = [] #require the user to choose a language
|
25 |
languages = models.keys()
|
@@ -34,9 +51,11 @@ if language:
|
|
34 |
nlp = spacy.load(select_model)
|
35 |
|
36 |
nlp.max_length = 1200000
|
|
|
37 |
|
38 |
uploaded_files = st.file_uploader("Select files to process", accept_multiple_files=True)
|
39 |
-
|
|
|
40 |
for uploaded_file in uploaded_files:
|
41 |
file_type = uploaded_file.type
|
42 |
file_suffix = '.' + uploaded_file.name.split('.')[-1]
|
@@ -46,6 +65,7 @@ if language:
|
|
46 |
text = textract.process(temp.name)
|
47 |
text = text.decode('utf-8')
|
48 |
doc = nlp(text)
|
|
|
49 |
ent_freq = Counter([ent.label_ for ent in doc.ents])
|
50 |
for key, value in ent_freq.items():
|
51 |
if st.sidebar.button(key):
|
@@ -57,6 +77,7 @@ if language:
|
|
57 |
except Exception as e:
|
58 |
st.error(e)
|
59 |
|
60 |
-
|
|
|
61 |
#st.download_button('Download', '', 'text/plain')
|
62 |
|
|
|
1 |
import streamlit as st
|
2 |
import textract
|
3 |
import tempfile
|
4 |
+
from typing import List
|
5 |
import spacy
|
6 |
from spacy.tokens import DocBin, Doc
|
7 |
from collections import Counter
|
8 |
import srsly
|
9 |
+
from spacy.matcher import Matcher
|
10 |
|
11 |
# Import CSS file
|
12 |
with open("style.css") as f:
|
|
|
20 |
spacy.cli.download(select_model)
|
21 |
return True
|
22 |
|
23 |
+
def search_docs(query:str, matcher:Matcher, documents:List[Doc], nlp, match_pattern:List[dict]=None):
    """Register a pattern for *query* on *matcher* and collect its matches.

    Args:
        query: Search text typed by the user. It is also used as the key
            under which the pattern is registered on the matcher.
        matcher: Matcher the pattern is added to and that is then run over
            every document.
        documents: Already-processed documents to search.
        nlp: Callable used to tokenize ``query`` when no explicit pattern
            is supplied.
        match_pattern: Optional ready-made token pattern (a list of
            per-token attribute dicts). When given, it is used as-is and
            ``query`` is not tokenized.

    Returns:
        A flat list of ``(match_id, start, end)`` tuples, concatenated in
        document order. The tuples do not record which document they came
        from.
    """
    if match_pattern:
        pattern = match_pattern
    else:
        # One case-insensitive constraint per query token, so the query
        # matches as an exact token sequence regardless of casing.
        pattern = [{"LOWER": token.text.lower()} for token in nlp(query)]

    # An empty or whitespace-only query produces a zero-token pattern,
    # which the matcher rejects with a ValueError; report "no matches"
    # instead of crashing the app.
    if not pattern:
        return []

    matcher.add(query, [pattern])

    results = []
    for doc in documents:
        results.extend(matcher(doc))  # each match is (match_id, start, end)
    return results
|
39 |
|
|
|
40 |
models = srsly.read_json('models.json')
|
41 |
models[''] = [] #require the user to choose a language
|
42 |
languages = models.keys()
|
|
|
51 |
nlp = spacy.load(select_model)
|
52 |
|
53 |
nlp.max_length = 1200000
|
54 |
+
matcher = Matcher(nlp.vocab)
|
55 |
|
56 |
uploaded_files = st.file_uploader("Select files to process", accept_multiple_files=True)
|
57 |
+
query = st.sidebar.text_input(label="Enter your query", value="...")
|
58 |
+
documents = []
|
59 |
for uploaded_file in uploaded_files:
|
60 |
file_type = uploaded_file.type
|
61 |
file_suffix = '.' + uploaded_file.name.split('.')[-1]
|
|
|
65 |
text = textract.process(temp.name)
|
66 |
text = text.decode('utf-8')
|
67 |
doc = nlp(text)
|
68 |
+
documents.append(doc)
|
69 |
ent_freq = Counter([ent.label_ for ent in doc.ents])
|
70 |
for key, value in ent_freq.items():
|
71 |
if st.sidebar.button(key):
|
|
|
77 |
except Exception as e:
|
78 |
st.error(e)
|
79 |
|
80 |
+
results = search_docs(query, matcher, documents,nlp)
|
81 |
+
st.write(results)
|
82 |
#st.download_button('Download', '', 'text/plain')
|
83 |
|