eduardofv committed on
Commit
feaaa7e
1 Parent(s): 31e00f2

migrated from test space

Browse files
Files changed (4) hide show
  1. README.md +4 -4
  2. app.py +63 -0
  3. requirements.txt +2 -0
  4. titles-simple-0.pt +3 -0
README.md CHANGED
@@ -1,13 +1,13 @@
1
  ---
2
- title: Multilang_semantic_search_wikisimple
3
- emoji: 🚀
4
- colorFrom: yellow
5
  colorTo: blue
6
  sdk: streamlit
7
  sdk_version: 1.2.0
8
  app_file: app.py
9
  pinned: false
10
- license: apache-2.0
11
  ---
12
 
13
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces#reference
1
  ---
2
+ title: Test_space
3
+ emoji: 🔥
4
+ colorFrom: green
5
  colorTo: blue
6
  sdk: streamlit
7
  sdk_version: 1.2.0
8
  app_file: app.py
9
  pinned: false
10
+ license: lgpl-3.0
11
  ---
12
 
13
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces#reference
app.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Streamlit app: multilingual semantic search over Simple English Wikipedia titles.

A query typed in any language supported by the sentence-transformers model is
embedded and compared against precomputed title embeddings; the closest
articles are shown with their URL, a text snippet and the raw hit.
"""
import streamlit as st

import torch
import sentence_transformers as sent
import datasets as ds

MODEL_NAME = "distiluse-base-multilingual-cased-v1"
EMBEDDINGS_PATH = "titles-simple-0.pt"
TOP_K = 5            # number of results shown per query
SNIPPET_CHARS = 500  # characters of article text shown per result


# allow_output_mutation=True on every loader: these objects are large and
# never mutated, so skip st.cache's per-rerun hashing of the return value.
@st.cache(allow_output_mutation=True)
def load_articles():
    """Return the train split of the Simple English Wikipedia dataset.

    Cached so the dataset is opened once per server process instead of on
    every Streamlit rerun.
    """
    return ds.load_dataset("wikipedia", "20220301.simple")["train"]


@st.cache(allow_output_mutation=True)
def load_model():
    """Return the multilingual sentence-embedding model (loaded once)."""
    return sent.SentenceTransformer(MODEL_NAME)


@st.cache(allow_output_mutation=True)
def load_wikipedia_embeddings():
    """Return the precomputed title embeddings, mapped onto the CPU."""
    return torch.load(EMBEDDINGS_PATH, map_location=torch.device('cpu'))


st.title("Multilingual Semantic Search for Wikipedia Simple English")
st.markdown("""
Use semantic search to find related articles in Wikipedia Simple English: using a language model (sentence-transformers/distiluse-base-multilingual-cased-v1) we can find the closest titles from Wikipedia Simple English (wikipedia) queried in any of the model's trained languages: Arabic, Chinese, Dutch, English, French, German, Italian, Korean, Polish, Portuguese, Russian, Spanish, Turkish:


- colesterol
- développement humain
- Crise dos mísseis de Cuba


Also, "near natural language" queries are usually enough to bring up relevant results. Try:


- ¿cuál es el edificio más alto del mundo?
- comment préparer du poulet frit
- melhores películas de pixar


(note: search is done only on the article titles, not the content)
""")

articles = load_articles()
model = load_model()
embeddings = load_wikipedia_embeddings()

query = st.text_input("Query (es, fr, pt, ...)").strip()

# Skip the search for an empty or whitespace-only query.
if query:
    query_emb = model.encode([query], convert_to_tensor=True)

    # semantic_search returns one list of hits per query; there is one query.
    hits = sent.util.semantic_search(query_emb, embeddings, top_k=TOP_K)[0]

    st.write(f"----\n{query}:\n")
    for hit in hits:
        # NOTE(review): assumes the embeddings file is row-aligned with the
        # dataset's train split, so corpus_id is also the article index.
        article = articles[hit['corpus_id']]  # fetch the row once
        st.header(f"{article['title']}")
        st.write(article['url'])
        st.write(article['text'][:SNIPPET_CHARS] + "...")
        st.write(hit)
requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
1
+ torch
2
+ sentence-transformers
titles-simple-0.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0c2625c3dc72d3df79f6d8491915fe7207113ee140cc9cf561df48465e63f9ec
3
+ size 420512491