"""Streamlit app: semantic search over《Records of the Grand Historian》.

The user types a query, the query is embedded with the sentence-transformer
identified by ``TAG``, and the closest sentences from ``grand_historian.csv``
are shown in a table.
"""
import streamlit as st
import pandas as pd
from sentence_transformers import SentenceTransformer
from forgebox.cosine import CosineSearch
import numpy as np

# Model identifier handed to SentenceTransformer.
TAG = "raynardj/xlsearch-cross-lang-search-zh-vs-classicical-cn"


# NOTE(review): ``st.cache`` is believed to be deprecated in newer Streamlit
# releases in favour of ``st.cache_resource`` / ``st.cache_data``. It is kept
# here so the app still runs on the Streamlit version it was written for --
# confirm the deployed version before migrating.
@st.cache(allow_output_mutation=True)
def load_encoder():
    """Build the sentence encoder for ``TAG``.

    Returns:
        The ``SentenceTransformer`` instance. The call is wrapped in
        ``st.cache`` so Streamlit reruns can reuse the result.
    """
    with st.spinner(f"Loading Transformer:{TAG}"):
        encoder = SentenceTransformer(TAG)
    return encoder


encoder = load_encoder()


@st.cache(allow_output_mutation=True)
def load_book():
    """Read the book sentences from ``grand_historian.csv``.

    Returns:
        A list holding the values of the CSV's ``sentence`` column, in file
        order.
    """
    with st.spinner("📚 Loading Book..."):
        df = pd.read_csv("grand_historian.csv")
    return list(df.sentence)


all_lines = load_book()


@st.cache(allow_output_mutation=True)
def encode_book():
    """Embed every sentence of the book and wrap the vectors in a search index.

    Reads the module-level ``encoder`` and ``all_lines``.

    Returns:
        A ``CosineSearch`` built from the encoded sentences.
    """
    with st.spinner("Encoding sentences for book《Records of the Grand Historian》"):
        vec = encoder.encode(all_lines, batch_size=64, show_progress_bar=True)
        index = CosineSearch(vec)
    return index


cosine = encode_book()


def search(text, top_k=5):
    """Return the best-matching book sentences for ``text``.

    Args:
        text: The query string to embed and search with.
        top_k: How many sentences to return. Defaults to 5, the value that
            used to be hard-coded.

    Returns:
        A ``pandas.DataFrame`` with a single ``sentence`` column containing
        the first ``top_k`` sentences in the order given by the index.
    """
    enc = encoder.encode(text)  # encode the search key
    # The value returned by CosineSearch is used as row indices into
    # ``all_lines``; presumably best match first -- confirm against forgebox.
    order = cosine(enc)
    sentence_df = pd.DataFrame(
        {"sentence": np.array(all_lines)[order[:top_k]]}
    )
    return sentence_df


keyword = st.text_input("用白话搜", "")

if st.button("搜索"):
    # Strip first so a whitespace-only query does not trigger a search.
    query = keyword.strip()
    if query:
        with st.spinner(f"🔍 Searching for {query}"):
            df = search(query)
            st.table(df)