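"""Streamlit demo (Part 3: Search): retrieve documents for a user query from an
Elasticsearch index and rerank them with a Flash Rank-based compression retriever."""
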
import os
import time

import streamlit as st
from dotenv import load_dotenv

from retrieval_pipeline import get_retriever, get_compression_retriever


def get_result(query, compression_retriever):
    """Run the query through the compression retriever and return the retrieved
    chunks together with the wall-clock latency of the call, in seconds."""
    t0 = time.time()
    retrieved_chunks = compression_retriever.get_relevant_documents(query)
    latency = time.time() - t0
    return retrieved_chunks, latency


st.set_page_config(
    layout="wide",
    page_title="Retrieval Demo"
)

@st.cache_resource  # build the retriever stack once instead of on every Streamlit rerun
def setup():
    load_dotenv()
    ELASTICSEARCH_URL = os.getenv('ELASTICSEARCH_URL')

    retriever = get_retriever(index='masa.ai', elasticsearch_url=ELASTICSEARCH_URL)
    compression_retriever = get_compression_retriever(retriever)
    return compression_retriever
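
# retrieval_pipeline is not shown here; a minimal sketch of what
# get_compression_retriever plausibly builds, assuming LangChain's
# ContextualCompressionRetriever wrapping a FlashrankRerank compressor
# (the Flash Rank integration described in the expander below):
#
#     from langchain.retrievers import ContextualCompressionRetriever
#     from langchain.retrievers.document_compressors import FlashrankRerank
#
#     def get_compression_retriever(base_retriever):
#         compressor = FlashrankRerank()  # default ~4MB ms-marco-TinyBERT-L-2-v2 model
#         return ContextualCompressionRetriever(
#             base_compressor=compressor,
#             base_retriever=base_retriever,
#         )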


def main():
    st.title("Part 3: Search")
    # st.sidebar.write("According to the Model Size 👇")
    # menu = ["Nano", "Small", "Medium", "Large"]
    # choice = st.sidebar.selectbox("Choose", menu)

    st.sidebar.info("""
**Model Options:**
- **Nano**: ~4MB, blazing fast model with competitive performance (ranking precision).
- **Small**: ~34MB, slightly slower with the best performance (ranking precision).
- **Medium**: ~110MB, slower model with the best zero-shot performance (ranking precision).
- **Large**: ~150MB, slower model with competitive performance (ranking precision) for 100+ languages.
""")

    with st.spinner('Setting up...'):
        compression_retriever = setup()

    with st.expander("Tech Stack Used"):
        st.markdown("""
        **Flash Rank**: Ultra-lite & Super-fast Python library for search & retrieval re-ranking.

        - **Ultra-lite**: No heavy dependencies. Runs on CPU with a tiny ~4MB reranking model.
        - **Super-fast**: Speed depends on the number of tokens in passages and query, plus model depth.
        - **Cost-efficient**: Ideal for serverless deployments with low memory and time requirements.
        - **Based on State-of-the-Art Cross-encoders**: Includes models like ms-marco-TinyBERT-L-2-v2 (default), ms-marco-MiniLM-L-12-v2, rank-T5-flan, and ms-marco-MultiBERT-L-12.
        - **Sleek Models for Efficiency**: Designed for minimal overhead in user-facing scenarios.

        _Flash Rank is tailored for scenarios requiring efficient and effective reranking, balancing performance with resource usage._
        """)


    with st.form(key='input_form'):
        query_input = st.text_area("Query Input")
        # context_input = st.text_area("Context Input")
        submit_button = st.form_submit_button(label='Retrieve')
    
    if submit_button:
        st.session_state.submitted = True

    if 'submitted' in st.session_state:
        with st.spinner('Processing...'):
            result, latency = get_result(query_input, compression_retriever)
            st.subheader("Please find the retrieved documents below 👇")
            st.write("latency:", latency, " ms")
            st.json(result)



if __name__ == "__main__":
    main()