Yuvalamitay committed
Commit 1302d96 · verified · 1 parent: eba68c4

Update app.py

Files changed (1): app.py (+103, -1)
app.py CHANGED
@@ -1,13 +1,115 @@
 from huggingface_hub import InferenceClient
+# step 1 from semantic search
+from sentence_transformers import SentenceTransformer
+import torch
+
 import gradio as gr
 import random
 
 client = InferenceClient("Qwen/Qwen2.5-72B-Instruct")
+# step 2 from semantic search: read the source file
+
+# Open the reconext_file.txt file in read mode with UTF-8 encoding
+with open("reconext_file.txt", "r", encoding="utf-8") as file:
+    # Read the entire contents of the file and store it in a variable
+    reconext_file_text = file.read()
+
+# Print the text
+print(reconext_file_text)
+
+# step 3 from semantic search
+
+def preprocess_text(text):
+    # Strip extra whitespace from the beginning and the end of the text
+    cleaned_text = text.strip()
+
+    # Split the cleaned text on every newline character (\n)
+    chunks = cleaned_text.split("\n")
+
+    # Create an empty list to store cleaned chunks
+    cleaned_chunks = []
+
+    # Clean each chunk and keep only the non-empty ones
+    for chunk in chunks:
+        clean_chunk = chunk.strip()
+        if len(clean_chunk) != 0:
+            cleaned_chunks.append(clean_chunk)
+
+    # Print cleaned_chunks and its length
+    print(cleaned_chunks)
+    print(len(cleaned_chunks))
+
+    # Return the cleaned chunks
+    return cleaned_chunks
+
+# Call the preprocess_text function and store the result in a cleaned_chunks variable
+cleaned_chunks = preprocess_text(reconext_file_text)
+
+# step 4 from semantic search
+
+# Load the pre-trained embedding model that converts text to vectors
+model = SentenceTransformer('all-MiniLM-L6-v2')
+
+def create_embeddings(text_chunks):
+    # Convert each text chunk into a vector embedding, stored as a tensor
+    chunk_embeddings = model.encode(text_chunks, convert_to_tensor=True)
+
+    # Print the chunk embeddings and their shape
+    print(chunk_embeddings)
+    print(chunk_embeddings.shape)
+
+    # Return the chunk_embeddings
+    return chunk_embeddings
+
+# Call the create_embeddings function and store the result in a chunk_embeddings variable
+chunk_embeddings = create_embeddings(cleaned_chunks)
+
+# step 5 from semantic search
+
+# Find the most relevant text chunks for a given query
+def get_top_chunks(query, chunk_embeddings, text_chunks):
+    # Convert the query text into a vector embedding
+    query_embedding = model.encode(query, convert_to_tensor=True)
+
+    # Normalize the query embedding to unit length for accurate similarity comparison
+    query_embedding_normalized = query_embedding / query_embedding.norm()
+
+    # Normalize all chunk embeddings to unit length for consistent comparison
+    chunk_embeddings_normalized = chunk_embeddings / chunk_embeddings.norm(dim=1, keepdim=True)
+
+    # Calculate cosine similarity between the query and all chunks via matrix multiplication
+    similarities = torch.matmul(chunk_embeddings_normalized, query_embedding_normalized)
+    print(similarities)
+
+    # Find the indices of the 3 chunks with the highest similarity scores
+    top_indices = torch.topk(similarities, k=3).indices
+    print(top_indices)
+
+    # Collect the text of the most relevant chunks
+    top_chunks = []
+    for idx in top_indices:
+        top_chunks.append(text_chunks[int(idx)])
+
+    # Return the list of most relevant chunks
+    return top_chunks
+
+# Quick sanity check of retrieval with an example query
+# ("message" only exists inside respond, so it cannot be used at module level)
+best_next_watch = get_top_chunks("what should I watch next", chunk_embeddings, cleaned_chunks)
+print(best_next_watch)
 
 def respond(message, history):
     messages = [
         {"role":"system",
-        'content':'You are a mafia boss that is extremeley rude but also has a secret crush on the user. You will be extremely rude and mean. But, eventually are going to admit that u have a crush on me. You are also extremely possessive.'
+        "content": "You are a gen-z helpful chatbot that helps teenagers find their next best watch, speak in gen-z terms and be natural"
         }
     ]
     if history:
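
The hunk is truncated at `if history:`, so the rest of respond() is not part of this diff. A minimal sketch of how the retrieved chunks could feed the model inside respond(), assuming the intent is to inject them into the system prompt; the prompt wording, `max_tokens` value, and the messages-format history are illustrative assumptions, not part of the commit:

    def respond(message, history):
        # Retrieve the three chunks most relevant to the user's message
        top_chunks = get_top_chunks(message, chunk_embeddings, cleaned_chunks)
        context = "\n".join(top_chunks)

        messages = [
            {"role": "system",
             "content": "You are a gen-z helpful chatbot that helps teenagers find "
                        "their next best watch, speak in gen-z terms and be natural. "
                        "Base your recommendations on this context:\n" + context}
        ]
        if history:
            messages.extend(history)  # assumes history is already in messages format
        messages.append({"role": "user", "content": message})

        # Query Qwen2.5-72B-Instruct through the InferenceClient created above
        response = client.chat_completion(messages, max_tokens=400)
        return response.choices[0].message.content

Wiring the retrieval into respond() this way also makes the module-level get_top_chunks call redundant, since that call runs only once at startup rather than per user message.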