"""Streamlit app: search the Records of the Grand Historian with modern Chinese.

The query is embedded with a cross-language sentence encoder and compared
against precomputed sentence vectors loaded from ``vec.npy``; the best
matching sentences from ``grand_historian.csv`` are shown in a table.
"""
import numpy as np
import pandas as pd
import streamlit as st
from forgebox.cosine import CosineSearch
from PIL import Image
from sentence_transformers import SentenceTransformer

image = Image.open('shiji.png')

st.markdown("""
## 🍻 跨古/现代文搜索: 用白话搜史记
""")

st.sidebar.image(image, use_column_width=True)
st.sidebar.markdown("""
Search《Records of the Grand Historian》 with modern Chinese

### References
* Model trained [here, please hit ⭐️](https://github.com/raynardj/yuan)
* [Trained crossed language BERT](https://huggingface.co/raynardj/xlsearch-cross-lang-search-zh-vs-classicical-cn)

### Related projects
* Read more [ancient books(almost all) with a translator](https://huggingface.co/spaces/raynardj/duguwen-classical-chinese-to-morden-translate)
* [Modern Chines to classical Chinese translator](https://huggingface.co/spaces/raynardj/modern-chinese-to-ancient-translate-wenyanwen)
""")

# Identifier handed to SentenceTransformer to load the query encoder.
TAG = "raynardj/xlsearch-cross-lang-search-zh-vs-classicical-cn"


# NOTE(review): newer Streamlit releases deprecate ``st.cache`` in favour of
# ``st.cache_resource`` / ``st.cache_data`` -- confirm the installed version
# before migrating; the decorator is left unchanged here.
@st.cache(allow_output_mutation=True)
def load_encoder():
    """Load the sentence encoder named by ``TAG``, showing a spinner meanwhile."""
    with st.spinner(f"Loading Transformer:{TAG}"):
        encoder = SentenceTransformer(TAG)
    return encoder


encoder = load_encoder()


@st.cache(allow_output_mutation=True)
def load_book():
    """Read ``grand_historian.csv`` and return its ``sentences`` column as a list."""
    with st.spinner("📚 Loading Book..."):
        df = pd.read_csv("grand_historian.csv")
    return list(df.sentences)


all_lines = load_book()


@st.cache(allow_output_mutation=True)
def encode_book():
    """Build the cosine-similarity index over the book's sentence vectors.

    Despite the name, nothing is encoded here: the vectors are loaded from
    ``vec.npy`` and wrapped in a ``CosineSearch`` object.
    """
    with st.spinner("Encoding sentences for book《Records of the Grand Historian》"):
        # Presumably row i of vec.npy is the embedding of all_lines[i];
        # verify against whatever script produced the file.
        vec = np.load('vec.npy')
        cosine = CosineSearch(vec)
    return cosine


cosine = encode_book()


def search(text, top_k=5):
    """Return the sentences from the book that best match ``text``.

    Args:
        text: Query string; it is embedded with the module-level ``encoder``.
        top_k: Number of leading results to keep. Defaults to 5.

    Returns:
        A ``pandas.DataFrame`` with a single ``sentence`` column holding the
        first ``top_k`` entries of the ranking returned by ``cosine``.
    """
    enc = encoder.encode(text)  # encode the search key
    # The result is used to index all_lines, so it is treated as an index
    # array; presumably best match first -- confirm against CosineSearch.
    order = cosine(enc)
    sentence_df = pd.DataFrame(
        {"sentence": np.array(all_lines)[order[:top_k]]}
    )
    return sentence_df


keyword = st.text_input("用白话搜", "")

# Only search once the button is pressed and the query is non-empty.
if st.button("搜索") and keyword:
    with st.spinner(f"🔍 Searching for {keyword}"):
        df = search(keyword)
        st.table(df)