eagle0504 committed
Commit 7872237
1 Parent(s): f4b4efa

code cleaned up

Files changed (1)
  1. utils/helper_functions.py +130 -18
utils/helper_functions.py CHANGED
@@ -19,7 +19,18 @@ openai.api_key = os.environ["OPENAI_API_KEY"]
 
 
 def merge_dataframes(dataframes: List[pd.DataFrame]) -> pd.DataFrame:
-    """Merges a list of DataFrames, keeping only specific columns."""
+    """
+    Merges a list of pandas DataFrames into a single DataFrame.
+
+    This function concatenates the given DataFrames and filters the resulting DataFrame to only include the columns 'context', 'questions', and 'answers'.
+
+    Parameters:
+    dataframes (List[pd.DataFrame]): A list of DataFrames to be merged.
+
+    Returns:
+    pd.DataFrame: The concatenated DataFrame containing only the specified columns.
+    """
+
     # Concatenate the list of dataframes
     combined_dataframe = pd.concat(
         dataframes, ignore_index=True
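
For reference, a minimal usage sketch of the documented helper (the sample frames are hypothetical, and the import path assumes the file above is importable as a module):

import pandas as pd

from utils.helper_functions import merge_dataframes

# Two hypothetical frames carrying the three expected columns
df_a = pd.DataFrame({"context": ["c1"], "questions": ["q1"], "answers": ["a1"]})
df_b = pd.DataFrame({"context": ["c2"], "questions": ["q2"], "answers": ["a2"]})

merged = merge_dataframes([df_a, df_b])
print(list(merged.columns))  # ['context', 'questions', 'answers']
print(len(merged))           # 2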
@@ -64,21 +75,50 @@ def call_chatgpt(prompt: str) -> str:
 
 
 def openai_text_embedding(prompt: str) -> List[float]:
+    """
+    Retrieves the text embedding for a given prompt using OpenAI's text-embedding model.
+
+    This function utilizes OpenAI's API to generate an embedding for the input text. It specifically uses the "text-embedding-ada-002" model.
+
+    Parameters:
+    prompt (str): The text input for which to generate an embedding.
+
+    Returns:
+    List[float]: The embedding vector for the input text.
+    """
+
+    # Call the OpenAI API to create a text embedding
     return openai.Embedding.create(input=prompt, model="text-embedding-ada-002")[
         "data"
-    ][0]["embedding"]
+    ][0][
+        "embedding"
+    ]  # Retrieve the embedding from the response
 
 
 def calculate_sts_openai_score(sentence1: str, sentence2: str) -> float:
+    """
+    Calculates the Semantic Textual Similarity (STS) between two sentences using OpenAI's text-embedding model.
+
+    This function computes embeddings for each sentence and then calculates the cosine similarity between these embeddings. A higher score indicates greater similarity.
+
+    Parameters:
+    sentence1 (str): The first sentence for similarity comparison.
+    sentence2 (str): The second sentence for similarity comparison.
+
+    Returns:
+    float: The STS score representing the similarity between sentence1 and sentence2.
+    """
+
     # Compute sentence embeddings
     embedding1 = openai_text_embedding(sentence1)  # Embedding for the first sentence
     embedding2 = openai_text_embedding(sentence2)  # Embedding for the second sentence
 
-    # Convert to array
+    # Convert embeddings to NumPy arrays
     embedding1 = np.asarray(embedding1)
     embedding2 = np.asarray(embedding2)
 
     # Calculate cosine similarity between the embeddings
+    # Since 'cosine' returns the distance, 1 - distance is used to get similarity
     similarity_score = 1 - cosine(embedding1, embedding2)
 
     return similarity_score
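
The score above is one minus SciPy's cosine distance. A self-contained sketch of that arithmetic with made-up vectors (no API call involved):

import numpy as np
from scipy.spatial.distance import cosine

v1 = np.array([1.0, 0.0, 1.0])
v2 = np.array([1.0, 1.0, 0.0])

# cosine() returns the distance 1 - cos(theta); dot(v1, v2) = 1 and both
# norms are sqrt(2), so cos(theta) = 0.5 and the similarity is 1 - 0.5
similarity = 1 - cosine(v1, v2)
print(similarity)  # 0.5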
@@ -88,11 +128,29 @@ def add_dist_score_column(
     dataframe: pd.DataFrame,
     sentence: str,
 ) -> pd.DataFrame:
+    """
+    Adds a new column to the provided DataFrame with STS (Semantic Textual Similarity) scores,
+    calculated between a given sentence and each question in the 'questions' column of the DataFrame.
+    The DataFrame is then sorted by this new column in descending order and the top 5 rows are returned.
+
+    Parameters:
+    dataframe (pd.DataFrame): A pandas DataFrame containing a 'questions' column.
+    sentence (str): The sentence against which to compute STS scores for each question in the DataFrame.
+
+    Returns:
+    pd.DataFrame: A DataFrame containing the original data along with the new 'stsopenai' column,
+    sorted by the 'stsopenai' column and limited to the top 5 entries with the highest scores.
+    """
+
+    # Calculate the STS score between `sentence` and each row's question
     dataframe["stsopenai"] = dataframe["questions"].apply(
         lambda x: calculate_sts_openai_score(str(x), sentence)
     )
 
+    # Sort the dataframe by the newly added 'stsopenai' column in descending order
     sorted_dataframe = dataframe.sort_values(by="stsopenai", ascending=False)
+
+    # Return the top 5 rows from the sorted dataframe
     return sorted_dataframe.iloc[:5, :]
 
 
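
A usage sketch for the scoring helper (hypothetical data; note that each row costs two embedding API calls, one for the query sentence and one for the question):

import pandas as pd

from utils.helper_functions import add_dist_score_column

df = pd.DataFrame(
    {
        "questions": ["What is YSA?", "How do I apply?", "Where is the office?"],
        "answers": ["...", "...", "..."],
    }
)

top = add_dist_score_column(df, "Tell me about YSA")
print(top[["questions", "stsopenai"]])  # at most 5 rows, highest scores first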
@@ -181,21 +239,75 @@ def llama2_7b_ysa(prompt: str) -> str:
 
 
 def quantize_to_4bit(arr: Union[np.ndarray, Any]) -> np.ndarray:
-    """Converts an array to a 4-bit representation by normalizing and scaling its values."""
-    if not isinstance(arr, np.ndarray):  # Ensure input is a numpy array
-        arr = np.array(arr)
-    arr_min = arr.min()  # Find minimum value
-    arr_max = arr.max()  # Find maximum value
-    normalized_arr = (arr - arr_min) / (arr_max - arr_min)  # Normalize values to [0, 1]
-    return np.round(normalized_arr * 15).astype(int)  # Scale to 0-15 and round
+    """
+    Converts an array to a 4-bit representation by normalizing and scaling its values.
+
+    The function first checks whether the input is a numpy ndarray and,
+    if not, converts the input into a numpy ndarray. It then normalizes
+    the values of the array to lie between 0 and 1. Finally, it scales these
+    normalized values to the range 0-15, corresponding to 4-bit integers,
+    and returns this array of integers.
+
+    Parameters:
+    arr (Union[np.ndarray, Any]): An array or any type that can be converted to a numpy ndarray.
+
+    Returns:
+    np.ndarray: A numpy ndarray containing the input data quantized to a 4-bit representation.
+
+    Examples:
+    >>> quantize_to_4bit([0, 128, 255])
+    array([ 0,  8, 15])
+    """
+    if not isinstance(arr, np.ndarray):  # Check if the input is a numpy array
+        arr = np.array(arr)  # Convert to numpy array if not already
+
+    arr_min = arr.min()  # Find minimum value in the array
+    arr_max = arr.max()  # Find maximum value in the array
+
+    # Normalize array values to a [0, 1] range
+    normalized_arr = (arr - arr_min) / (arr_max - arr_min)
+
+    # Scale normalized values to a 0-15 range (4-bit) and convert to integer
+    return np.round(normalized_arr * 15).astype(int)
 
 
 def quantized_influence(arr1: np.ndarray, arr2: np.ndarray) -> float:
-    """Calculates a weighted measure of influence based on quantized version of input arrays."""
-    arr1_4bit = quantize_to_4bit(arr1)  # Quantize arr1 to 4-bit
-    arr2_4bit = quantize_to_4bit(arr2)  # Quantize arr2 to 4-bit
-    unique_values = np.unique(arr1_4bit)  # Find unique values in arr1_4bit
-    y_bar_global = np.mean(arr2_4bit)  # Compute global average of arr2_4bit
-    # Compute weighted local averages and normalize
-    weighted_local_averages = [(np.mean(arr2_4bit[arr1_4bit == val]) - y_bar_global) ** 2 * len(arr2_4bit[arr1_4bit == val]) ** 2 for val in unique_values]
-    return np.mean(weighted_local_averages) / np.std(arr2_4bit)  # Return normalized weighted average
+    """
+    Calculates a weighted measure of influence between two arrays based on their quantized (4-bit) versions.
+
+    This function first quantizes both input arrays to 4-bit representations and then derives weights from
+    the unique values of the first array's quantized version. It uses these weights to compute local averages
+    within the second array's quantized version, assessing the influence of the first array on the second.
+    The influence is normalized by the standard deviation of the second array's quantized version.
+
+    Parameters:
+    arr1 (np.ndarray): The first input numpy array.
+    arr2 (np.ndarray): The second input numpy array.
+
+    Returns:
+    float: The calculated influence value, a weighted average normalized by the standard deviation.
+
+    Note:
+    Both inputs must be numpy ndarrays; the `quantize_to_4bit` function defined above is used
+    to convert each array to its 4-bit representation.
+    """
+    arr1_4bit = quantize_to_4bit(arr1)  # Quantize the first array to 4-bit
+    arr2_4bit = quantize_to_4bit(arr2)  # Quantize the second array to 4-bit
+
+    unique_values = np.unique(
+        arr1_4bit
+    )  # Get the unique 4-bit values from the first array
+    y_bar_global = np.mean(
+        arr2_4bit
+    )  # Calculate the global mean of the second array's 4-bit version
+
+    # Compute the squared difference between each local mean and the global mean,
+    # weighted by the square of the local group's size
+    weighted_local_averages = [
+        (np.mean(arr2_4bit[arr1_4bit == val]) - y_bar_global) ** 2
+        * len(arr2_4bit[arr1_4bit == val]) ** 2
+        for val in unique_values
+    ]
+
+    # Return the weighted mean normalized by the standard deviation of the second array's 4-bit version
+    return np.mean(weighted_local_averages) / np.std(arr2_4bit)
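
To sanity-check the quantization math, a small worked example under the assumption that both helpers above are in scope (the arrays are made up):

import numpy as np

a = np.array([0, 0, 1, 1])
b = np.array([0, 0, 1, 1])

# quantize_to_4bit maps both arrays to [0, 0, 15, 15]
# unique quantized values of a: {0, 15}; global mean of b's 4-bit form: 7.5
# each group contributes (local_mean - 7.5)**2 * count**2 = 56.25 * 4 = 225.0
# influence = mean([225.0, 225.0]) / std([0, 0, 15, 15]) = 225.0 / 7.5
print(quantized_influence(a, b))  # 30.0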