File size: 2,343 Bytes
19d508b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
import pandas as pd
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import streamlit as st
import pickle
import os

# Input artefacts and the encoder used by the retrieval pipeline.
CSV_PATH = "/content/Hydra-Movie-Scrape.csv"
EMBEDDINGS_PATH = "/content/embeddings.pkl"
MODEL_NAME = "all-MiniLM-L6-v2"


@st.cache_resource
def _load_search_resources():
    """Load the movie table, embeddings, FAISS index and query encoder once.

    Streamlit re-executes this script from the top on every widget
    interaction. Without caching, each button click would re-read the CSV,
    unpickle the embeddings, rebuild the FAISS index and reload the
    SentenceTransformer model. ``st.cache_resource`` keeps one shared copy of
    the returned objects for the lifetime of the server process; they are
    only read afterwards, never mutated.

    Returns:
        A tuple ``(movies, raw_embeddings, matrix, search_index, encoder)``:
        the movie DataFrame, the object exactly as unpickled, that object as
        a C-contiguous float32 2-D array, an ``IndexFlatL2`` populated with
        that array, and the SentenceTransformer used to encode queries.

    Raises:
        ValueError: If the unpickled embeddings do not form a 2-D matrix.
    """
    # Load the CSV data into a DataFrame
    movies = pd.read_csv(CSV_PATH)

    # Load the precomputed embeddings.
    # SECURITY: pickle.load executes arbitrary code embedded in the file, so
    # EMBEDDINGS_PATH must only ever point at a file this project produced.
    with open(EMBEDDINGS_PATH, "rb") as f:
        raw_embeddings = pickle.load(f)

    # FAISS requires a C-contiguous float32 matrix of shape (n_docs, dim).
    matrix = np.ascontiguousarray(np.array(raw_embeddings).astype("float32"))
    if matrix.ndim != 2:
        raise ValueError(
            "Expected a 2-D embedding matrix (n_docs, dim), "
            f"got an array of shape {matrix.shape}"
        )

    # Build a FAISS index for efficient similarity search (exact L2 distance)
    search_index = faiss.IndexFlatL2(matrix.shape[1])
    search_index.add(matrix)

    # Load the SentenceTransformer model for encoding the query
    encoder = SentenceTransformer(MODEL_NAME)

    return movies, raw_embeddings, matrix, search_index, encoder


# Module-level names kept identical to what the rest of the script expects.
df, doc_embeddings, embedding_matrix, index, model = _load_search_resources()

# Function to retrieve the most relevant movies based on a query
def retrieve(query, top_k=10):
    """Retrieve the row positions of the top_k movies closest to the query.

    Args:
        query: Free-text question; it is encoded with the module-level
            ``model``.
        top_k: Maximum number of results wanted. Values larger than the
            number of indexed vectors are clamped to that number, so the
            result is not filled up with ``-1`` placeholders.

    Returns:
        A 1-D integer array of positions into the indexed embeddings, in the
        order returned by ``index.search``. The array is empty when
        ``top_k`` is not positive or the index holds no vectors.
    """
    no_hits = np.empty(0, dtype="int64")

    # FAISS rejects k <= 0, so answer "nothing" without touching the index.
    if top_k <= 0:
        return no_hits

    # Never ask for more neighbours than there are vectors in the index.
    k = min(int(top_k), index.ntotal)
    if k == 0:
        return no_hits

    # Encode the query and shape it as a (1, dim) float32 batch for FAISS.
    query_vector = np.asarray(model.encode(query), dtype="float32").reshape(1, -1)
    _distances, indices = index.search(query_vector, k)
    return indices[0]  # Single query, so take the first (only) result row

# Streamlit app layout
st.title("Movie Dataset RAG Application")
query = st.text_input("Ask a question about movies:")

# Columns shown for every hit, in display order.
_DISPLAY_FIELDS = ("Title", "Year", "Director", "Cast", "Summary")

# Dataset text is rendered through st.markdown, which would otherwise treat
# these characters as formatting (emphasis, links, headings, LaTeX via "$",
# inline HTML). Backslash-escaping makes them display literally.
_MARKDOWN_ESCAPES = str.maketrans({ch: "\\" + ch for ch in "\\`*_[]#$<>~|"})


def _format_movie(movie_details):
    """Return one movie row as Markdown with one labelled field per line."""
    lines = [
        f"*{field}*: {str(movie_details[field]).translate(_MARKDOWN_ESCAPES)}"
        for field in _DISPLAY_FIELDS
    ]
    # Two trailing spaces + newline is a Markdown hard line break; a bare
    # "\n" would be collapsed and all fields would run together on one line.
    return "  \n".join(lines)


if st.button("Submit"):
    # A whitespace-only query carries no information; treat it as empty.
    if query.strip():
        # Retrieve the most relevant documents (movies) based on the query
        indices = retrieve(query)

        # Keep only ids that address a real row: this skips the -1
        # placeholders and any id beyond the end of the DataFrame (which
        # would raise IndexError if the CSV and embeddings are misaligned).
        row_count = len(df)
        entries = [
            _format_movie(df.iloc[idx])
            for idx in indices
            if 0 <= idx < row_count
        ]

        # Output the response
        if entries:
            st.write("Here are some movies that match your query:")
            # A blank line between entries renders each movie as a paragraph.
            st.markdown("\n\n".join(entries))
        else:
            st.write("No relevant documents found.")
    else:
        st.write("Please enter a query.")