File size: 1,343 Bytes
67eeae3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
import streamlit as st
import pandas as pd
from sentence_transformers import SentenceTransformer
from forgebox.cosine import CosineSearch
import numpy as np

TAG = "raynardj/xlsearch-cross-lang-search-zh-vs-classicical-cn"

@st.cache(allow_output_mutation=True)
def load_encoder():
    """Construct the sentence encoder identified by ``TAG``.

    A Streamlit spinner is shown while the model is being built. The
    function is wrapped with ``st.cache(allow_output_mutation=True)``.

    Returns:
        The ``SentenceTransformer`` instance created from ``TAG``.
    """
    spinner_message = f"Loading Transformer:{TAG}"
    with st.spinner(spinner_message):
        return SentenceTransformer(TAG)

# Module-level encoder; read by encode_book() and search().
encoder = load_encoder()

@st.cache(allow_output_mutation=True)
def load_book():
    """Read the book's sentences from ``grand_historian.csv``.

    A Streamlit spinner is shown while the CSV file is being read.

    Returns:
        list: The values of the CSV's ``sentence`` column, in row order.
    """
    spinner_message = f"📚 Loading Book..."
    with st.spinner(spinner_message):
        book_frame = pd.read_csv("grand_historian.csv")
    sentences = list(book_frame.sentence)
    return sentences

# Every sentence of the book as a list; encoded by encode_book() and
# indexed by search() to turn result positions back into text.
all_lines = load_book()

@st.cache(allow_output_mutation=True)
def encode_book():
    """Encode every book sentence and wrap the vectors in a CosineSearch.

    Reads the module-level ``encoder`` and ``all_lines``. A Streamlit
    spinner is shown while encoding and index construction run.

    Returns:
        The ``CosineSearch`` object built from the encoded sentences.
    """
    with st.spinner(f"Encoding sentences for book《Records of the Grand Historian》"):
        embeddings = encoder.encode(
            all_lines,
            batch_size=64,
            show_progress_bar=True,
        )
        return CosineSearch(embeddings)

# CosineSearch built over the encoded book; called by search() with the
# encoded query.
cosine = encode_book()

def search(text: str, top_k: int = 5) -> pd.DataFrame:
    """Look up the book sentences ranked first for ``text``.

    Args:
        text: Query text; it is encoded with the module-level ``encoder``.
        top_k: Maximum number of sentences to return. Defaults to 5, the
            count that used to be hard-coded. Negative values are treated
            as 0.

    Returns:
        A DataFrame with a single ``sentence`` column holding the
        sentences at the first ``top_k`` positions of the ranking
        returned by ``cosine``.
    """
    query_vec = encoder.encode(text)  # encode the search key
    # The result is used as indices into `all_lines`, not as distances.
    # NOTE(review): presumably ordered most-similar first — confirm
    # against CosineSearch.
    ranking = cosine(query_vec)
    # Index the list directly: converting the whole book into a NumPy
    # array on every query is wasted work when only a few rows are needed.
    top_sentences = [all_lines[i] for i in ranking[:max(top_k, 0)]]
    return pd.DataFrame({"sentence": top_sentences})

# Search UI: a text box and a button. A search runs only when the button
# is pressed and the text box is non-empty; the results are rendered as a
# table while the spinner is still active.
keyword = st.text_input("用白话搜", "")
if st.button("搜索") and keyword:
    with st.spinner(f"🔍 Searching for {keyword}"):
        st.table(search(keyword))