Tesneem committed on
Commit
61804bb
·
verified ·
1 Parent(s): 785deee

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +40 -12
app.py CHANGED
@@ -9,14 +9,34 @@ from datasets import load_dataset
9
  # Load pre-trained SentenceTransformer model
10
  embedding_model = SentenceTransformer("thenlper/gte-large")
11
 
12
- # Example dataset with genres (replace with your actual data)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
  dataset = load_dataset("hugginglearners/netflix-shows")
14
- dataset = dataset.filter(lambda x: x['description'] is not None and x['listed_in'] is not None and x['title'] is not None)
15
- data = dataset['train'] # Accessing the 'train' split of the dataset
16
 
17
- # Convert the dataset to a list of dictionaries for easier indexing
18
- data_list = list[data]
19
- print(data_list)
20
  # Combine description and genre for embedding
21
  def combine_description_title_and_genre(description, listed_in, title):
22
  return f"{description} Genre: {listed_in} Title: {title}"
@@ -29,25 +49,33 @@ def get_embedding(text):
29
  def vector_search(query):
30
  query_embedding = get_embedding(query)
31
 
32
- # Generate embeddings for the combined description and genre
33
- embeddings = np.array([get_embedding(combine_description_title_and_genre(item["description"], item["listed_in"],item["title"])) for item in data_list[0]])
 
 
 
 
 
 
 
 
 
34
 
35
  # Calculate cosine similarity between the query and all embeddings
36
  similarities = cosine_similarity([query_embedding], embeddings)
37
-
38
  # # Adjust similarity scores based on ratings
39
  # ratings = np.array([item["rating"] for item in data_list])
40
  # adjusted_similarities = similarities * ratings.reshape(-1, 1)
41
 
42
- # Get top N most similar items (e.g., top 3)
43
  top_n = 3
44
  top_indices = similarities[0].argsort()[-top_n:][::-1] # Get indices of the top N results
45
- top_items = [data_list[i] for i in top_indices]
46
 
47
  # Format the output for display
48
  search_result = ""
49
  for item in top_items:
50
- search_result += f"Title: {item['title']}, Description: {item['description']}, Genre: {item['listed_in']}, Rating: {item['rating']}\n"
51
 
52
  return search_result
53
 
 
9
  # Load pre-trained SentenceTransformer model
10
  embedding_model = SentenceTransformer("thenlper/gte-large")
11
 
12
+ # # Example dataset with genres (replace with your actual data)
13
+ # dataset = load_dataset("hugginglearners/netflix-shows")
14
+ # dataset = dataset.filter(lambda x: x['description'] is not None and x['listed_in'] is not None and x['title'] is not None)
15
+ # data = dataset['train'] # Accessing the 'train' split of the dataset
16
+
17
+ # # Convert the dataset to a list of dictionaries for easier indexing
18
+ # data_list = list[data]
19
+ # print(data_list)
20
+ # # Combine description and genre for embedding
21
+ # def combine_description_title_and_genre(description, listed_in, title):
22
+ # return f"{description} Genre: {listed_in} Title: {title}"
23
+
24
+ # # Generate embedding for the query
25
+ # def get_embedding(text):
26
+ # return embedding_model.encode(text)
27
+
28
+ # # Vector search function
29
+ # def vector_search(query):
30
+ # query_embedding = get_embedding(query)
31
+
32
+ # # Generate embeddings for the combined description and genre
33
+ # embeddings = np.array([get_embedding(combine_description_title_and_genre(item["description"], item["listed_in"],item["title"])) for item in data_list[0]])
34
+
35
+ # # Calculate cosine similarity between the query and all embeddings
36
+ # similarities = cosine_similarity([query_embedding], embeddings)
37
+ # Load dataset (using the correct dataset identifier for your case)
38
  dataset = load_dataset("hugginglearners/netflix-shows")
 
 
39
 
 
 
 
40
  # Combine description and genre for embedding
41
  def combine_description_title_and_genre(description, listed_in, title):
42
  return f"{description} Genre: {listed_in} Title: {title}"
 
49
  def vector_search(query):
50
  query_embedding = get_embedding(query)
51
 
52
+ # Function to generate embeddings for each item in the dataset
53
+ def generate_embeddings(example):
54
+ return {
55
+ 'embedding': get_embedding(combine_description_title_and_genre(example["description"], example["listed_in"], example["title"]))
56
+ }
57
+
58
+ # Generate embeddings for the dataset using map
59
+ embeddings_dataset = dataset["train"].map(generate_embeddings)
60
+
61
+ # Extract embeddings
62
+ embeddings = np.array([embedding['embedding'] for embedding in embeddings_dataset])
63
 
64
  # Calculate cosine similarity between the query and all embeddings
65
  similarities = cosine_similarity([query_embedding], embeddings)
 
66
  # # Adjust similarity scores based on ratings
67
  # ratings = np.array([item["rating"] for item in data_list])
68
  # adjusted_similarities = similarities * ratings.reshape(-1, 1)
69
 
70
+ # Get top N most similar items (e.g., top 3)
71
  top_n = 3
72
  top_indices = similarities[0].argsort()[-top_n:][::-1] # Get indices of the top N results
73
+ top_items = [dataset["train"][i] for i in top_indices]
74
 
75
  # Format the output for display
76
  search_result = ""
77
  for item in top_items:
78
+ search_result += f"Title: {item['title']}, Description: {item['description']}, Genre: {item['listed_in']}\n"
79
 
80
  return search_result
81