m-butler committed
Commit: f266d24
1 Parent(s): 6efdf97

Upload folder using huggingface_hub

Files changed (4)
  1. README.md +2 -8
  2. app.py +76 -0
  3. processed_episodes.json +0 -0
  4. summary_embeddings.npy +3 -0
README.md CHANGED
@@ -1,12 +1,6 @@
 ---
-title: Picardle
-emoji: 👁
-colorFrom: red
-colorTo: yellow
+title: picardle
+app_file: app.py
 sdk: gradio
 sdk_version: 4.44.0
-app_file: app.py
-pinned: false
 ---
-
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
app.py ADDED
@@ -0,0 +1,76 @@
+import json
+import numpy as np
+from wordllama import WordLlama
+import gradio as gr
+from numpy.linalg import norm
+import os
+
+# Load episodes data
+with open('processed_episodes.json', 'r') as f:
+    episodes = json.load(f)
+
+# Load the WordLlama model with 256 dimensions
+wl = WordLlama.load(trunc_dim=256)
+
+# Check if embeddings are already cached
+if os.path.exists('summary_embeddings.npy'):
+    # Load embeddings from cache
+    summary_embeddings = np.load('summary_embeddings.npy')
+else:
+    # Compute embeddings for all summaries
+    summaries = [episode['summary'] for episode in episodes]
+    summary_embeddings = wl.embed(summaries)
+    summary_embeddings = np.array(summary_embeddings)
+    # Save embeddings to cache
+    np.save('summary_embeddings.npy', summary_embeddings)
+
+# Define the function to find matching episodes
+def find_matching_episodes(query, top_k=5):
+    top_k = int(top_k)  # the slider may pass a float; slicing needs an int
+    # Compute the embedding for the query
+    query_embedding = wl.embed([query])[0]  # The result is a 1D numpy array
+
+    # Normalize embeddings
+    query_norm = query_embedding / (norm(query_embedding) + 1e-10)
+    summaries_norm = summary_embeddings / (norm(summary_embeddings, axis=1, keepdims=True) + 1e-10)
+
+    # Compute cosine similarities
+    similarities = summaries_norm @ query_norm
+
+    # Get indices of the top_k most similar summaries
+    top_k_indices = np.argsort(similarities)[-top_k:][::-1]
+
+    # Retrieve the matching episodes and their similarity scores
+    matching_episodes = []
+    for idx in top_k_indices:
+        episode = episodes[idx]
+        similarity_score = similarities[idx]
+        # Create a list of values instead of a dictionary
+        result = [
+            episode['episode_number'],
+            episode['title'],
+            f"{similarity_score:.4f}"
+        ]
+        matching_episodes.append(result)
+
+    return matching_episodes
+
+# Create the Gradio interface
+interface = gr.Interface(
+    fn=find_matching_episodes,
+    inputs=[
+        gr.Textbox(lines=2, placeholder="Enter your query here...", label="Search Query"),
+        gr.Slider(minimum=1, maximum=10, value=5, label="Number of Results")
+    ],
+    outputs=gr.Dataframe(
+        headers=["Episode Number", "Title", "Similarity Score"],
+        label="Matching Episodes"
+    ),
+    title="Picardle",
+    description="Enter a query to find matching ST:TNG episodes based on their summaries."
+)
+
+# Launch the app
+if __name__ == "__main__":
+    interface.launch(share=True)
+
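The ranking inside find_matching_episodes is plain cosine similarity: the query vector and the cached summary embeddings are L2-normalized, and the dot product gives the score. A minimal standalone sketch of that top-k step, using made-up toy vectors instead of the WordLlama embeddings and episode data above:

import numpy as np
from numpy.linalg import norm

# Toy stand-ins: 3 "summary" vectors and one query vector (4 dims for brevity).
corpus = np.array([[1.0, 0.0, 0.0, 0.0],
                   [0.7, 0.7, 0.0, 0.0],
                   [0.0, 0.0, 1.0, 0.0]])
query = np.array([1.0, 0.1, 0.0, 0.0])

# Same normalization and dot-product ranking as app.py uses.
corpus_norm = corpus / (norm(corpus, axis=1, keepdims=True) + 1e-10)
query_norm = query / (norm(query) + 1e-10)
similarities = corpus_norm @ query_norm

top_k = 2
top_idx = np.argsort(similarities)[-top_k:][::-1]
print(top_idx, similarities[top_idx])  # row 0 ranks first, then row 1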
processed_episodes.json ADDED
The diff for this file is too large to render. See raw diff
 
summary_embeddings.npy ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cb345516827ee150c38b8785b8cc26c7b65f95cb98b90160635e009addf59751
+size 182400
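summary_embeddings.npy is tracked with Git LFS, so the diff only shows the pointer file (oid and size), not the array itself. As a quick sanity check after cloning with LFS enabled, the cache can be loaded and its shape inspected; given trunc_dim=256 in app.py, the second dimension should be 256, while the row count depends on processed_episodes.json:

import numpy as np

emb = np.load('summary_embeddings.npy')
print(emb.shape, emb.dtype)  # expected (num_episodes, 256); exact dtype depends on what wl.embed returns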