eagle0504 committed
Commit 7872237
1 Parent(s): f4b4efa

code cleaned up

Files changed (1)
  1. utils/helper_functions.py +130 -18
utils/helper_functions.py CHANGED
@@ -19,7 +19,18 @@ openai.api_key = os.environ["OPENAI_API_KEY"]
 
 
 def merge_dataframes(dataframes: List[pd.DataFrame]) -> pd.DataFrame:
-    """Merges a list of DataFrames, keeping only specific columns."""
+    """
+    Merges a list of pandas DataFrames into a single DataFrame.
+
+    This function concatenates the given DataFrames and filters the resulting DataFrame to only include the columns 'context', 'questions', and 'answers'.
+
+    Parameters:
+    dataframes (List[pd.DataFrame]): A list of DataFrames to be merged.
+
+    Returns:
+    pd.DataFrame: The concatenated DataFrame containing only the specified columns.
+    """
+
     # Concatenate the list of dataframes
     combined_dataframe = pd.concat(
         dataframes, ignore_index=True
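
For reference, a minimal usage sketch of the documented helper (the sample frames are hypothetical, and the import path assumes the file above is importable as a module):

import pandas as pd

from utils.helper_functions import merge_dataframes

# Two hypothetical frames carrying the three expected columns
df_a = pd.DataFrame({"context": ["c1"], "questions": ["q1"], "answers": ["a1"]})
df_b = pd.DataFrame({"context": ["c2"], "questions": ["q2"], "answers": ["a2"]})

merged = merge_dataframes([df_a, df_b])
print(list(merged.columns))  # ['context', 'questions', 'answers']
print(len(merged))           # 2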
@@ -64,21 +75,50 @@ def call_chatgpt(prompt: str) -> str:
 
 
 def openai_text_embedding(prompt: str) -> List[float]:
+    """
+    Retrieves the text embedding for a given prompt using OpenAI's text-embedding model.
+
+    This function utilizes OpenAI's API to generate an embedding for the input text. It specifically uses the "text-embedding-ada-002" model.
+
+    Parameters:
+    prompt (str): The text input for which to generate an embedding.
+
+    Returns:
+    List[float]: The embedding vector for the input text.
+    """
+
+    # Call the OpenAI API to create a text embedding
     return openai.Embedding.create(input=prompt, model="text-embedding-ada-002")[
         "data"
-    ][0]["embedding"]
+    ][0][
+        "embedding"
+    ]  # Retrieve the embedding from the response
 
 
 def calculate_sts_openai_score(sentence1: str, sentence2: str) -> float:
+    """
+    Calculates the Semantic Textual Similarity (STS) between two sentences using OpenAI's text-embedding model.
+
+    This function computes embeddings for each sentence and then calculates the cosine similarity between these embeddings. A higher score indicates greater similarity.
+
+    Parameters:
+    sentence1 (str): The first sentence for similarity comparison.
+    sentence2 (str): The second sentence for similarity comparison.
+
+    Returns:
+    float: The STS score representing the similarity between sentence1 and sentence2.
+    """
+
     # Compute sentence embeddings
     embedding1 = openai_text_embedding(sentence1)  # Embedding for the first sentence
     embedding2 = openai_text_embedding(sentence2)  # Embedding for the second sentence
 
-    # Convert to array
+    # Convert embeddings to NumPy arrays
     embedding1 = np.asarray(embedding1)
     embedding2 = np.asarray(embedding2)
 
     # Calculate cosine similarity between the embeddings
+    # Since 'cosine' returns the distance, 1 - distance is used to get similarity
     similarity_score = 1 - cosine(embedding1, embedding2)
 
     return similarity_score
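
The score above is one minus SciPy's cosine distance. A self-contained sketch of that arithmetic with made-up vectors (no API call involved):

import numpy as np
from scipy.spatial.distance import cosine

v1 = np.array([1.0, 0.0, 1.0])
v2 = np.array([1.0, 1.0, 0.0])

# cosine() returns the distance 1 - cos(theta); dot(v1, v2) = 1 and both
# norms are sqrt(2), so cos(theta) = 0.5 and the similarity is 1 - 0.5
similarity = 1 - cosine(v1, v2)
print(similarity)  # 0.5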
@@ -88,11 +128,29 @@ def add_dist_score_column(
     dataframe: pd.DataFrame,
     sentence: str,
 ) -> pd.DataFrame:
+    """
+    Adds a new column to the provided DataFrame with STS (Semantic Textual Similarity) scores,
+    calculated between a given sentence and each question in the 'questions' column of the DataFrame.
+    The DataFrame is then sorted by this new column in descending order and the top 5 rows are returned.
+
+    Parameters:
+    dataframe (pd.DataFrame): A pandas DataFrame containing a 'questions' column.
+    sentence (str): The sentence against which to compute STS scores for each question in the DataFrame.
+
+    Returns:
+    pd.DataFrame: A DataFrame containing the original data along with the new 'stsopenai' column,
+    sorted by the 'stsopenai' column and limited to the top 5 entries with the highest scores.
+    """
+
+    # Calculate the STS score between `sentence` and each row's question
     dataframe["stsopenai"] = dataframe["questions"].apply(
         lambda x: calculate_sts_openai_score(str(x), sentence)
     )
 
+    # Sort the dataframe by the newly added 'stsopenai' column in descending order
     sorted_dataframe = dataframe.sort_values(by="stsopenai", ascending=False)
+
+    # Return the top 5 rows from the sorted dataframe
     return sorted_dataframe.iloc[:5, :]
 
 
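
A usage sketch for the scoring helper (hypothetical data; note that each row costs two embedding API calls, one for the query sentence and one for the question):

import pandas as pd

from utils.helper_functions import add_dist_score_column

df = pd.DataFrame(
    {
        "questions": ["What is YSA?", "How do I apply?", "Where is the office?"],
        "answers": ["...", "...", "..."],
    }
)

top = add_dist_score_column(df, "Tell me about YSA")
print(top[["questions", "stsopenai"]])  # at most 5 rows, highest scores first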
@@ -181,21 +239,75 @@ def llama2_7b_ysa(prompt: str) -> str:
 
 
 def quantize_to_4bit(arr: Union[np.ndarray, Any]) -> np.ndarray:
-    """Converts an array to a 4-bit representation by normalizing and scaling its values."""
-    if not isinstance(arr, np.ndarray):  # Ensure input is a numpy array
-        arr = np.array(arr)
-    arr_min = arr.min()  # Find minimum value
-    arr_max = arr.max()  # Find maximum value
-    normalized_arr = (arr - arr_min) / (arr_max - arr_min)  # Normalize values to [0, 1]
-    return np.round(normalized_arr * 15).astype(int)  # Scale to 0-15 and round
+    """
+    Converts an array to a 4-bit representation by normalizing and scaling its values.
+
+    The function first checks whether the input is a numpy ndarray and,
+    if not, converts the input into a numpy ndarray. It then normalizes
+    the values of the array to lie between 0 and 1. Finally, it scales these
+    normalized values to the range 0-15, corresponding to 4-bit integers,
+    and returns this array of integers.
+
+    Parameters:
+    arr (Union[np.ndarray, Any]): An array or any type that can be converted to a numpy ndarray.
+
+    Returns:
+    np.ndarray: A numpy ndarray containing the input data quantized to a 4-bit representation.
+
+    Examples:
+    >>> quantize_to_4bit([0, 128, 255])
+    array([ 0,  8, 15])
+    """
+    if not isinstance(arr, np.ndarray):  # Check if the input is a numpy array
+        arr = np.array(arr)  # Convert to numpy array if not already
+
+    arr_min = arr.min()  # Find minimum value in the array
+    arr_max = arr.max()  # Find maximum value in the array
+
+    # Normalize array values to a [0, 1] range
+    normalized_arr = (arr - arr_min) / (arr_max - arr_min)
+
+    # Scale normalized values to a 0-15 range (4-bit) and convert to integer
+    return np.round(normalized_arr * 15).astype(int)
 
 
 def quantized_influence(arr1: np.ndarray, arr2: np.ndarray) -> float:
-    """Calculates a weighted measure of influence based on quantized version of input arrays."""
-    arr1_4bit = quantize_to_4bit(arr1)  # Quantize arr1 to 4-bit
-    arr2_4bit = quantize_to_4bit(arr2)  # Quantize arr2 to 4-bit
-    unique_values = np.unique(arr1_4bit)  # Find unique values in arr1_4bit
-    y_bar_global = np.mean(arr2_4bit)  # Compute global average of arr2_4bit
-    # Compute weighted local averages and normalize
-    weighted_local_averages = [(np.mean(arr2_4bit[arr1_4bit == val]) - y_bar_global) ** 2 * len(arr2_4bit[arr1_4bit == val]) ** 2 for val in unique_values]
-    return np.mean(weighted_local_averages) / np.std(arr2_4bit)  # Return normalized weighted average
+    """
+    Calculates a weighted measure of influence between two arrays based on their quantized (4-bit) versions.
+
+    This function first quantizes both input arrays to 4-bit representations and then derives weights from
+    the unique values of the first array's quantized version. It uses these weights to compute local averages
+    within the second array's quantized version, assessing the influence of the first array on the second.
+    The influence is normalized by the standard deviation of the second array's quantized version.
+
+    Parameters:
+    arr1 (np.ndarray): The first input numpy array.
+    arr2 (np.ndarray): The second input numpy array.
+
+    Returns:
+    float: The calculated influence value, a weighted average normalized by the standard deviation.
+
+    Note:
+    Both inputs must be numpy ndarrays; the `quantize_to_4bit` function defined above is used
+    to convert each array to its 4-bit representation.
+    """
+    arr1_4bit = quantize_to_4bit(arr1)  # Quantize the first array to 4-bit
+    arr2_4bit = quantize_to_4bit(arr2)  # Quantize the second array to 4-bit
+
+    unique_values = np.unique(
+        arr1_4bit
+    )  # Get the unique 4-bit values from the first array
+    y_bar_global = np.mean(
+        arr2_4bit
+    )  # Calculate the global mean of the second array's 4-bit version
+
+    # Compute the squared difference between each local mean and the global mean,
+    # weighted by the square of the local group's size
+    weighted_local_averages = [
+        (np.mean(arr2_4bit[arr1_4bit == val]) - y_bar_global) ** 2
+        * len(arr2_4bit[arr1_4bit == val]) ** 2
+        for val in unique_values
+    ]
+
+    # Return the weighted mean normalized by the standard deviation of the second array's 4-bit version
+    return np.mean(weighted_local_averages) / np.std(arr2_4bit)
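
To sanity-check the quantization math, a small worked example under the assumption that both helpers above are in scope (the arrays are made up):

import numpy as np

a = np.array([0, 0, 1, 1])
b = np.array([0, 0, 1, 1])

# quantize_to_4bit maps both arrays to [0, 0, 15, 15]
# unique quantized values of a: {0, 15}; global mean of b's 4-bit form: 7.5
# each group contributes (local_mean - 7.5)**2 * count**2 = 56.25 * 4 = 225.0
# influence = mean([225.0, 225.0]) / std([0, 0, 15, 15]) = 225.0 / 7.5
print(quantized_influence(a, b))  # 30.0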