# Scraped page header (not part of the program):
#   raynardj's picture
#   load from encoded
#   cfa2da9
import streamlit as st
import pandas as pd
from sentence_transformers import SentenceTransformer
from forgebox.cosine import CosineSearch
import numpy as np
from PIL import Image
# Open the sidebar banner image; the relative path is resolved against the
# process working directory.
image = Image.open('shiji.png')
# Main-pane heading (rendered as a level-2 Markdown title).
st.markdown("""
## 🍻 跨古/现代文搜索: 用白话搜史记
""")
# Sidebar: banner image first, then a short description with reference and
# related-project links.
st.sidebar.image(image, use_column_width=True)
st.sidebar.markdown("""
Search《Records of the Grand Historian》 with modern Chinese
### References
* Model trained [here, please hit ⭐️](https://github.com/raynardj/yuan)
* [Trained crossed language BERT](https://huggingface.co/raynardj/xlsearch-cross-lang-search-zh-vs-classicical-cn)
### Related projects
* Read more [ancient books(almost all) with a translator](https://huggingface.co/spaces/raynardj/duguwen-classical-chinese-to-morden-translate)
* [Modern Chines to classical Chinese translator](https://huggingface.co/spaces/raynardj/modern-chinese-to-ancient-translate-wenyanwen)
""")
# Model identifier handed to SentenceTransformer in load_encoder(); it is also
# interpolated into that function's spinner text.
TAG = "raynardj/xlsearch-cross-lang-search-zh-vs-classicical-cn"
@st.cache(allow_output_mutation=True)
def load_encoder():
    """Instantiate the sentence encoder named by ``TAG``.

    A spinner carrying the model identifier is displayed while the
    ``SentenceTransformer`` is being constructed.

    Returns:
        The ``SentenceTransformer`` built from ``TAG``.
    """
    spinner_text = f"Loading Transformer:{TAG}"
    with st.spinner(spinner_text):
        return SentenceTransformer(TAG)
# Module-level encoder used by search(); obtained via the st.cache-wrapped loader.
encoder = load_encoder()
@st.cache(allow_output_mutation=True)
def load_book():
    """Read the book's sentences from ``grand_historian.csv``.

    The CSV is read from the working directory while a spinner is shown.

    Returns:
        list: the values of the CSV's ``sentences`` column, in file order.
    """
    with st.spinner(f"📚 Loading Book..."):
        book = pd.read_csv("grand_historian.csv")
        sentences = book.sentences
        return list(sentences)
# Sentences of the book as a plain list; search() picks its results out of it.
all_lines = load_book()
@st.cache(allow_output_mutation=True)
def encode_book():
    """Build the cosine-similarity index over the stored sentence vectors.

    Despite the name, nothing is encoded here: the vectors are read from
    ``vec.npy`` and wrapped in a ``CosineSearch``.
    NOTE(review): presumably row ``i`` of ``vec.npy`` is the embedding of
    sentence ``i`` in ``grand_historian.csv`` -- confirm against the script
    that produced the file.

    Returns:
        CosineSearch: search object constructed from the loaded array.
    """
    with st.spinner(f"Encoding sentences for book《Records of the Grand Historian》"):
        return CosineSearch(np.load('vec.npy'))
# Module-level CosineSearch instance; search() calls it with the encoded query.
cosine = encode_book()
def search(text, top_k=5):
    """Look up the book sentences that best match a query.

    Args:
        text: Query string; it is passed to ``encoder.encode`` unchanged.
        top_k: How many leading results to keep. Defaults to 5, the value
            that used to be hard-coded.

    Returns:
        pandas.DataFrame: one column, ``"sentence"``, holding at most
        ``top_k`` entries of ``all_lines`` in the order given by ``cosine``.
    """
    query_vec = encoder.encode(text)  # embed the search key
    # ``cosine`` returns positions into ``all_lines`` (not distances).
    # NOTE(review): presumably ordered best match first -- confirm against
    # forgebox's CosineSearch.
    order = cosine(query_vec)
    # Index the list directly; converting the whole book to a NumPy array on
    # every query just to pick a handful of rows is wasted work.
    hits = [all_lines[i] for i in order[:top_k]]
    return pd.DataFrame({"sentence": hits})
# Query box plus an explicit search button. The button is always rendered;
# a search only runs when it was clicked and the query is non-empty.
keyword = st.text_input("用白话搜", "")
search_clicked = st.button("搜索")
if search_clicked and keyword:
    with st.spinner(f"🔍 Searching for {keyword}"):
        df = search(keyword)
        st.table(df)