apjanco committed on
Commit
9496b6e
1 Parent(s): b16d274

first steps on search

Browse files
Files changed (2) hide show
  1. .gitignore +1 -0
  2. app.py +24 -3
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ venv/
app.py CHANGED
@@ -1,10 +1,12 @@
1
  import streamlit as st
2
  import textract
3
  import tempfile
 
4
  import spacy
5
  from spacy.tokens import DocBin, Doc
6
  from collections import Counter
7
  import srsly
 
8
 
9
  # Import CSS file
10
  with open("style.css") as f:
@@ -18,8 +20,23 @@ def download_model(select_model:str):
18
  spacy.cli.download(select_model)
19
  return True
20
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
 
22
- doc_bin = DocBin()
23
  models = srsly.read_json('models.json')
24
  models[''] = [] #require the user to choose a language
25
  languages = models.keys()
@@ -34,9 +51,11 @@ if language:
34
  nlp = spacy.load(select_model)
35
 
36
  nlp.max_length = 1200000
 
37
 
38
  uploaded_files = st.file_uploader("Select files to process", accept_multiple_files=True)
39
- search = st.sidebar.text_input(label="Enter your query", value="...")
 
40
  for uploaded_file in uploaded_files:
41
  file_type = uploaded_file.type
42
  file_suffix = '.' + uploaded_file.name.split('.')[-1]
@@ -46,6 +65,7 @@ if language:
46
  text = textract.process(temp.name)
47
  text = text.decode('utf-8')
48
  doc = nlp(text)
 
49
  ent_freq = Counter([ent.label_ for ent in doc.ents])
50
  for key, value in ent_freq.items():
51
  if st.sidebar.button(key):
@@ -57,6 +77,7 @@ if language:
57
  except Exception as e:
58
  st.error(e)
59
 
60
-
 
61
  #st.download_button('Download', '', 'text/plain')
62
 
 
1
  import streamlit as st
2
  import textract
3
  import tempfile
4
+ from typing import List
5
  import spacy
6
  from spacy.tokens import DocBin, Doc
7
  from collections import Counter
8
  import srsly
9
+ from spacy.matcher import Matcher
10
 
11
  # Import CSS file
12
  with open("style.css") as f:
 
20
  spacy.cli.download(select_model)
21
  return True
22
 
23
def search_docs(query: str, matcher: Matcher, documents: List[Doc], nlp, match_pattern: List[dict] = None):
    """Search parsed documents for a query and collect the raw matcher hits.

    Args:
        query: Text to look for. It is also used as the rule key under which
            the pattern is registered on ``matcher``.
        matcher: Matcher the pattern is registered on and that is then run
            over each document. It is mutated: the rule stored under
            ``query`` is (re)placed by this call.
        documents: Parsed documents to search.
        nlp: Callable that turns ``query`` into an iterable of tokens exposing
            a ``.text`` attribute. Only called when ``match_pattern`` is not
            supplied.
        match_pattern: Optional ready-made token pattern (a list of token
            attribute dicts). When given and non-empty it is used instead of
            the pattern derived from ``query``.

    Returns:
        A flat list of whatever ``matcher(doc)`` yields for each document, in
        document order (the original author notes these are
        ``(match_id, start, end)`` tuples). The hits are not tagged with the
        document they came from.
    """
    if match_pattern:
        pattern = match_pattern
    else:
        # Case-insensitive, token-by-token match on the query text.
        pattern = [{"LOWER": token.text.lower()} for token in nlp(query)]

    # A zero-token pattern (empty or whitespace-only query) cannot be
    # registered on the matcher; there is nothing to search for.
    if not pattern:
        return []

    # Replace any rule already stored under this key so that repeated
    # searches for the same query do not stack duplicate patterns and
    # report every hit more than once.
    if query in matcher:
        matcher.remove(query)
    matcher.add(query, [pattern])

    results = []
    for doc in documents:
        results.extend(matcher(doc))
    return results
39
 
 
40
  models = srsly.read_json('models.json')
41
  models[''] = [] #require the user to choose a language
42
  languages = models.keys()
 
51
  nlp = spacy.load(select_model)
52
 
53
  nlp.max_length = 1200000
54
+ matcher = Matcher(nlp.vocab)
55
 
56
  uploaded_files = st.file_uploader("Select files to process", accept_multiple_files=True)
57
+ query = st.sidebar.text_input(label="Enter your query", value="...")
58
+ documents = []
59
  for uploaded_file in uploaded_files:
60
  file_type = uploaded_file.type
61
  file_suffix = '.' + uploaded_file.name.split('.')[-1]
 
65
  text = textract.process(temp.name)
66
  text = text.decode('utf-8')
67
  doc = nlp(text)
68
+ documents.append(doc)
69
  ent_freq = Counter([ent.label_ for ent in doc.ents])
70
  for key, value in ent_freq.items():
71
  if st.sidebar.button(key):
 
77
  except Exception as e:
78
  st.error(e)
79
 
80
+ results = search_docs(query, matcher, documents,nlp)
81
+ st.write(results)
82
  #st.download_button('Download', '', 'text/plain')
83