|
import json |
|
import numpy as np |
|
from wordllama import WordLlama |
|
import gradio as gr |
|
from numpy.linalg import norm |
|
import os |
|
|
|
|
|
# --- One-time setup: load episode data and the (cached) summary embeddings ---

# Load the preprocessed episode records. Each record must carry at least
# 'summary', 'episode_number', and 'title' keys (see find_matching_episodes).
with open('processed_episodes.json', 'r', encoding='utf-8') as f:
    episodes = json.load(f)

# Load the WordLlama embedding model, truncated to 256 dimensions to keep
# the embedding matrix small.
wl = WordLlama.load(trunc_dim=256)

if os.path.exists('summary_embeddings.npy'):
    # Reuse the cached embedding matrix so restarts don't re-embed everything.
    # NOTE(review): the cache is never invalidated — if processed_episodes.json
    # changes, delete summary_embeddings.npy manually to force a rebuild.
    summary_embeddings = np.load('summary_embeddings.npy')
else:
    # First run: embed every episode summary once and cache the result.
    summaries = [episode['summary'] for episode in episodes]
    summary_embeddings = np.asarray(wl.embed(summaries))

    np.save('summary_embeddings.npy', summary_embeddings)
|
|
|
|
|
def find_matching_episodes(query, top_k=5):
    """Return the top_k episodes whose summaries best match *query*.

    Args:
        query: Free-text search string.
        top_k: Number of results to return. May arrive as a float
            (gr.Slider can pass e.g. 5.0); it is truncated to an int
            and clamped to at least 1.

    Returns:
        A list of [episode_number, title, similarity] rows, ordered by
        descending cosine similarity; the similarity is formatted as a
        4-decimal string.
    """
    # gr.Slider may deliver a float, and argsort slicing needs an int.
    # Clamping to >= 1 also avoids the [-0:] slice, which would silently
    # return every episode instead of none.
    top_k = max(1, int(top_k))

    query_embedding = wl.embed([query])[0]

    # Cosine similarity: L2-normalize both sides (the epsilon guards
    # against a zero vector), then one matrix-vector product scores all
    # episode summaries at once.
    query_norm = query_embedding / (norm(query_embedding) + 1e-10)
    summaries_norm = summary_embeddings / (norm(summary_embeddings, axis=1, keepdims=True) + 1e-10)
    similarities = summaries_norm @ query_norm

    # Indices of the top_k highest scores, best match first.
    top_k_indices = np.argsort(similarities)[-top_k:][::-1]

    matching_episodes = []
    for idx in top_k_indices:
        episode = episodes[idx]
        matching_episodes.append([
            episode['episode_number'],
            episode['title'],
            f"{similarities[idx]:.4f}",
        ])

    return matching_episodes
|
|
|
|
|
|
|
# Gradio UI: a query textbox plus a result-count slider, feeding
# find_matching_episodes; results render as a three-column table.
interface = gr.Interface(
    fn=find_matching_episodes,
    inputs=[
        gr.Textbox(lines=2, placeholder="Enter your query here...", label="Search Query"),
        # NOTE(review): no step= is set, so the slider value may reach the
        # function as a float — verify find_matching_episodes tolerates that.
        gr.Slider(minimum=1, maximum=10, value=5, label="Number of Results")
    ],
    outputs=gr.Dataframe(
        # Column order must match the rows built in find_matching_episodes.
        headers=["Episode Number", "Title", "Similarity Score"],
        label="Matching Episodes"
    ),
    title="Picardle",
    description="Enter a query to find matching ST:TNG episodes based on their summaries."
)
|
|
|
|
|
if __name__ == "__main__":
    # share=True asks Gradio for a temporary public URL in addition to the
    # local server — the app becomes reachable outside this machine.
    interface.launch(share=True)
|
|
|
|