raynardj committed on
Commit
67eeae3
1 Parent(s): 5e8b453

👜 baseline

Browse files
Files changed (4) hide show
  1. README.md +5 -29
  2. app.py +45 -0
  3. grand_historian.csv +0 -0
  4. requirements.txt +5 -0
README.md CHANGED
@@ -1,37 +1,13 @@
1
  ---
2
- title: X Language Search Ancient With Modern Words
3
- emoji: 🐠
4
- colorFrom: purple
5
  colorTo: purple
6
  sdk: streamlit
7
  app_file: app.py
8
  pinned: false
9
  ---
 
 
10
 
11
- # Configuration
12
 
13
- `title`: _string_
14
- Display title for the Space
15
-
16
- `emoji`: _string_
17
- Space emoji (emoji-only character allowed)
18
-
19
- `colorFrom`: _string_
20
- Color for Thumbnail gradient (red, yellow, green, blue, indigo, purple, pink, gray)
21
-
22
- `colorTo`: _string_
23
- Color for Thumbnail gradient (red, yellow, green, blue, indigo, purple, pink, gray)
24
-
25
- `sdk`: _string_
26
- Can be either `gradio`, `streamlit`, or `static`
27
-
28
- `sdk_version` : _string_
29
- Only applicable for `streamlit` SDK.
30
- See [doc](https://hf.co/docs/hub/spaces) for more info on supported versions.
31
-
32
- `app_file`: _string_
33
- Path to your main application file (which contains either `gradio` or `streamlit` Python code, or `static` html code).
34
- Path is relative to the root of the repository.
35
-
36
- `pinned`: _boolean_
37
- Whether the Space stays on top of your list.
1
  ---
2
+ title: Cross language search
3
+ emoji: ⚔️
4
+ colorFrom: indigo
5
  colorTo: purple
6
  sdk: streamlit
7
  app_file: app.py
8
  pinned: false
9
  ---
10
+ # Cross Language Search
11
+ > Search ancient books with modern words
12
 
 
13
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
app.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ from sentence_transformers import SentenceTransformer
4
+ from forgebox.cosine import CosineSearch
5
+ import numpy as np
6
+
7
+ TAG = "raynardj/xlsearch-cross-lang-search-zh-vs-classicical-cn"
8
+
9
+ @st.cache(allow_output_mutation=True)
10
+ def load_encoder():
11
+ with st.spinner(f"Loading Transformer:{TAG}"):
12
+ encoder = SentenceTransformer(TAG)
13
+ return encoder
14
+
15
+ encoder = load_encoder()
16
+
17
+ @st.cache(allow_output_mutation=True)
18
+ def load_book():
19
+ with st.spinner(f"📚 Loading Book..."):
20
+ df = pd.read_csv("grand_historian.csv")
21
+ return list(df.sentence)
22
+
23
+ all_lines = load_book()
24
+
25
+ @st.cache(allow_output_mutation=True)
26
+ def encode_book():
27
+ with st.spinner(f"Encoding sentences for book《Records of the Grand Historian》"):
28
+ vec = encoder.encode(all_lines, batch_size=64, show_progress_bar=True)
29
+ cosine = CosineSearch(vec)
30
+ return cosine
31
+
32
+ cosine = encode_book()
33
+
34
+ def search(text):
35
+ enc = encoder.encode(text) # encode the search key
36
+ order = cosine(enc) # distance array
37
+ sentence_df = pd.DataFrame({"sentence":np.array(all_lines)[order[:5]]})
38
+ return sentence_df
39
+
40
+ keyword = st.text_input("用白话搜", "")
41
+ if st.button("搜索"):
42
+ if keyword:
43
+ with st.spinner(f"🔍 Searching for {keyword}"):
44
+ df = search(keyword)
45
+ st.table(df)
grand_historian.csv ADDED
The diff for this file is too large to render. See raw diff
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
1
+ torch==1.7.1
2
+ sentence-transformers==2.1.0
3
+ transformers==4.12.3
4
+ pandas==1.3.5
5
+ forgebox==0.4.20