Spaces:

apjanco
/

bolete

Build error

App Files Files Community

apjanco committed on Aug 10, 2022

Commit

9496b6e

•

1 Parent(s): b16d274

first steps on search

Browse files

Files changed (2) hide show

.gitignore +1 -0
app.py +24 -3

.gitignore ADDED Viewed

	@@ -0,0 +1 @@


1	+ venv/

app.py CHANGED Viewed

@@ -1,10 +1,12 @@
 import streamlit as st
 import textract
 import tempfile
 import spacy
 from spacy.tokens import DocBin, Doc
 from collections import Counter
 import srsly
 # Import CSS file
 with open("style.css") as f:
@@ -18,8 +20,23 @@ def download_model(select_model:str):
         spacy.cli.download(select_model)
     return True
-doc_bin = DocBin()
 models = srsly.read_json('models.json')
 models[''] = [] #require the user to choose a language
 languages = models.keys()
@@ -34,9 +51,11 @@ if language:
             nlp = spacy.load(select_model)
             nlp.max_length = 1200000
             uploaded_files = st.file_uploader("Select files to process", accept_multiple_files=True)
-            search = st.sidebar.text_input(label="Enter your query", value="...")
             for uploaded_file in uploaded_files:
                 file_type = uploaded_file.type
                 file_suffix = '.' + uploaded_file.name.split('.')[-1]
@@ -46,6 +65,7 @@ if language:
                     text = textract.process(temp.name)
                     text = text.decode('utf-8')
                     doc = nlp(text)
                     ent_freq = Counter([ent.label_ for ent in doc.ents])
                     for key, value in ent_freq.items():
                         if st.sidebar.button(key):
@@ -57,6 +77,7 @@ if language:
                 except Exception as e:
                     st.error(e)
             #st.download_button('Download', '', 'text/plain')

 import streamlit as st
 import textract
 import tempfile
+from typing import List
 import spacy
 from spacy.tokens import DocBin, Doc
 from collections import Counter
 import srsly
+from spacy.matcher import Matcher
 # Import CSS file
 with open("style.css") as f:
         spacy.cli.download(select_model)
     return True
+def search_docs(query:str, matcher:Matcher, documents:List[Doc], nlp, match_pattern:str=None):
+    qdoc = nlp(query)
+    if match_pattern:
+        pattern = match_pattern
+    else:
+        pattern = []
+        for token in qdoc:
+            print('token',token.text.lower())
+            pattern.append({"LOWER": token.text.lower()})
+    matcher.add(query, [pattern])
+    results = []
+    for doc in documents:
+        matches = matcher(doc) #List[(match_id, start, end)]
+        #print('matches',matches)
+        results.extend(matches)
+    return results
 models = srsly.read_json('models.json')
 models[''] = [] #require the user to choose a language
 languages = models.keys()
             nlp = spacy.load(select_model)
             nlp.max_length = 1200000
+            matcher = Matcher(nlp.vocab)
             uploaded_files = st.file_uploader("Select files to process", accept_multiple_files=True)
+            query = st.sidebar.text_input(label="Enter your query", value="...")
+            documents = []
             for uploaded_file in uploaded_files:
                 file_type = uploaded_file.type
                 file_suffix = '.' + uploaded_file.name.split('.')[-1]
                     text = textract.process(temp.name)
                     text = text.decode('utf-8')
                     doc = nlp(text)
+                    documents.append(doc)
                     ent_freq = Counter([ent.label_ for ent in doc.ents])
                     for key, value in ent_freq.items():
                         if st.sidebar.button(key):
                 except Exception as e:
                     st.error(e)
+            results = search_docs(query, matcher, documents,nlp)
+            st.write(results)
             #st.download_button('Download', '', 'text/plain')