import numpy as np
import pandas as pd
import streamlit as st
from sentence_transformers import CrossEncoder, SentenceTransformer


# Load the dataset
def load_dataset():
    # Load the Databricks Dolly 15K dataset (exported as CSV)
    return pd.read_csv('dolly_15k.csv')


# Load models: a bi-encoder for candidate retrieval and a cross-encoder for re-ranking
embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
ranking_model = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-12-v2')

# Streamlit UI
st.title("Multi-Stage Text Retrieval Pipeline for QA")

question = st.text_input("Enter a question:")

if question:
    dataset = load_dataset()

    # Generate embeddings for the question and the dataset passages
    passages = dataset['response'].tolist()  # Adjust this according to your dataset's structure
    question_embedding = embedding_model.encode(question)
    passage_embeddings = embedding_model.encode(passages)

    # Stage 1: retrieve the top-k passages by embedding similarity
    top_k = 5
    similarities = np.inner(question_embedding, passage_embeddings)
    top_k_indices = np.argsort(similarities)[-top_k:][::-1]
    relevant_passages = [passages[i] for i in top_k_indices]

    st.subheader("Relevant passages:")
    for passage in relevant_passages:
        st.write(passage)

    # Stage 2: re-rank the retrieved passages with the cross-encoder
    ranked_scores = ranking_model.predict([(question, passage) for passage in relevant_passages])
    ranked_passages = sorted(zip(relevant_passages, ranked_scores), key=lambda x: x[1], reverse=True)

    st.subheader("Ranked passages:")
    for passage, score in ranked_passages:
        st.write(f"{passage} (Score: {score:.2f})")