# Spaces: Runtime error
# (page-status text captured together with the source; not part of the program)
import pandas as pd | |
import tiktoken | |
import os | |
import openai | |
from openai.embeddings_utils import get_embedding, cosine_similarity | |
import numpy as np | |
import streamlit as st | |
# Reviews together with their precomputed embeddings. Both path names are kept
# for backward compatibility; they refer to the same file.
input_datapath = "fine_food_reviews_with_embeddings_1k.csv"
datafile_path = input_datapath

# API key: earlier commented-out code called ``st.secrets("OPENAI_API_KEY")``,
# but ``st.secrets`` must be subscripted rather than called. To load the key
# from Streamlit secrets, uncomment:
# openai.api_key = st.secrets["OPENAI_API_KEY"]

st.title("Semantic Search")

# embedding model parameters
embedding_model = "text-embedding-ada-002"
embedding_encoding = "cl100k_base"  # this is the encoding for text-embedding-ada-002
max_tokens = 8000  # the maximum for text-embedding-ada-002 is 8191
encoding = tiktoken.get_encoding(embedding_encoding)
top_n = 500

# Load the data once. The script previously read this file, derived
# "combined"/"n_tokens", filtered by token count, and then discarded all of it
# by reading the same file again -- tokenizing every review for nothing.
df = pd.read_csv(datafile_path)

# search_reviews() reads the "combined" column; build it (summary as title,
# review text as content) only when the file does not already provide it.
if "combined" not in df.columns:
    df["combined"] = (
        "Title: " + df.Summary.str.strip() + "; Content: " + df.Text.str.strip()
    )

# Embeddings are stored as text in the CSV; turn each one into a numpy array.
# NOTE(review): eval() executes whatever expression the cell contains. That is
# only acceptable while the CSV is a trusted, locally produced file; switch to
# a literal-only parser (e.g. ast.literal_eval) if the file can come from
# elsewhere.
df["embedding"] = df.embedding.apply(eval).apply(np.array)
# search through the reviews for a specific product
def search_reviews(df, product_description, n=3, pprint=True):
    """Return the ``n`` reviews most similar to ``product_description``.

    The description is embedded with ``get_embedding`` and compared against
    every value in ``df.embedding`` using ``cosine_similarity``.

    Args:
        df: DataFrame with ``embedding``, ``combined`` and ``ProductId``
            columns. A ``similarity`` column is added to (or overwritten in)
            this DataFrame as a side effect.
        product_description: Free-text query to search for.
        n: Maximum number of results to return.
        pprint: When true, also print each product id and review text.

    Returns:
        A ``(results, product)`` tuple of Series sharing the same index:
        the cleaned-up review texts and the matching product ids, ordered
        from most to least similar. Fewer than ``n`` entries are returned
        when ``df`` has fewer than ``n`` rows.
    """
    product_embedding = get_embedding(
        product_description,
        engine="text-embedding-ada-002",
    )
    # NOTE: this writes into the caller's DataFrame (kept for compatibility).
    df["similarity"] = df.embedding.apply(
        lambda x: cosine_similarity(x, product_embedding)
    )

    # Rank once and derive both outputs from the same top-n slice.
    top = df.sort_values("similarity", ascending=False).head(n)
    results = (
        # The patterns are plain text, so disable regex interpretation
        # explicitly instead of relying on the version-dependent default.
        top.combined.str.replace("Title: ", "", regex=False)
        .str.replace("; Content:", ": ", regex=False)
    )
    product = top.ProductId

    if pprint:
        # Walk the rows actually returned: indexing by range(n) raised
        # IndexError whenever n exceeded the number of available reviews.
        for product_id, review in zip(product, results):
            print("Product : ", product_id)
            print(review)
            print()
    return results, product
# Search form: the query text and how many matches to show.
prompt = st.text_input("What do you want to search for? : ", "pizza")
top_n = st.number_input("How many results do you want to see? : ", min_value=1)

# Streamlit re-executes the whole script on every widget interaction, so the
# search (which requests an embedding for the query) is run only once the user
# presses the button, instead of on every rerun with the result thrown away.
if st.button("Search Reviews"):
    results, product = search_reviews(df, prompt, top_n)
    st.write(product, results)