smm / src /input_analysis /Standarddeviation.py
Sanket17's picture
Update src/input_analysis/Standarddeviation.py
1eb5cdf verified
import pandas as pd
import numpy as np
# Load Excel files
product_data = pd.read_excel("Output_File/excel/product_analysis.xlsx")
competitor_data = pd.read_excel("Output_File/excel/competitor_analysis.xlsx")
# Function to filter criteria based on available columns
def filter_existing_criteria(data, criteria):
"""
Filters a list of criteria to include only those present in the given DataFrame columns.
Args:
data: DataFrame containing the data.
criteria: List of criteria to filter.
Returns:
List of filtered criteria that exist in the DataFrame columns.
"""
return [criterion for criterion in criteria if criterion in data.columns]
# Define criteria for each category
branding_criteria = [
"Logo Placement", "Consistency", "Alignment",
"Brand Colors", "Typography Consistency", "Brand Identity", "Template Consistency"
]
content_marketing_criteria = [
"Content Visibility", "Engagement Cues", "Storytelling",
"Aesthetic Coherence", "Content Relevance"
]
social_media_marketing_criteria = [
"Font Size", "Visibility of Text", "Alignment",
"Aesthetic Appeal", "Repetitiveness"
]
# Filter criteria based on available columns in the data
branding_criteria_filtered = filter_existing_criteria(product_data, branding_criteria)
content_marketing_criteria_filtered = filter_existing_criteria(product_data, content_marketing_criteria)
social_media_marketing_criteria_filtered = filter_existing_criteria(product_data, social_media_marketing_criteria)
# Helper function to calculate the mean value for a criterion
def calculate_mean_criterion_value(product_data, competitor_data, criterion):
"""
Calculate the mean of a criterion from both product and competitor data, ignoring NaN values.
Args:
product_data: DataFrame containing product data.
competitor_data: DataFrame containing competitor data.
criterion: The column name representing the criterion.
Returns:
Mean value of the criterion.
"""
combined_values = np.concatenate([
product_data[criterion].dropna().values,
competitor_data[criterion].dropna().values,
])
return np.nanmean(combined_values)
# Helper function to calculate the score differences
def calculate_score_differences(product_scores, competitor_scores, product_data, competitor_data, category_criteria):
"""
Calculate the score differences, replacing NaN with mean values.
Args:
product_scores: Scores of a product post for the criteria.
competitor_scores: Scores of a competitor post for the criteria.
product_data: DataFrame containing product data.
competitor_data: DataFrame containing competitor data.
category_criteria: List of criteria in the category.
Returns:
Array of score differences.
"""
score_diff = product_scores - competitor_scores
for k, diff in enumerate(score_diff):
if np.isnan(diff):
mean_value = calculate_mean_criterion_value(product_data, competitor_data, category_criteria[k])
score_diff[k] = mean_value
return score_diff
# Main function to calculate the SD comparison matrix
def calculate_sd_comparison_matrix(product_data, competitor_data, category_criteria):
"""
Calculate a 6x6 SD Comparison Matrix for a specific category, replacing NaN differences with the mean of the criteria.
"""
# If there are no criteria, return a DataFrame of zeros
if not category_criteria:
return pd.DataFrame(
np.zeros((6, 6)),
index=[f"Product_{i+1}" for i in range(6)],
columns=[f"Competitor_{j+1}" for j in range(6)]
)
sd_matrix = np.zeros((6, 6)) # Initialize a 6x6 matrix for SD values
for i in range(6): # Loop over product posts
for j in range(6): # Loop over competitor posts
product_scores = product_data.iloc[i][category_criteria].values
competitor_scores = competitor_data.iloc[j][category_criteria].values
# Calculate score differences
score_diff = calculate_score_differences(
product_scores, competitor_scores, product_data, competitor_data, category_criteria
)
# Safely calculate the standard deviation
if len(score_diff) > 0:
sd_matrix[i, j] = np.std(score_diff)
else:
sd_matrix[i, j] = 0
# Convert to DataFrame for better readability
return pd.DataFrame(sd_matrix,
index=[f"Product_{i+1}" for i in range(6)],
columns=[f"Competitor_{j+1}" for j in range(6)])
# Calculate SD matrices for each category
branding_sd_matrix = calculate_sd_comparison_matrix(product_data, competitor_data, branding_criteria_filtered)
content_marketing_sd_matrix = calculate_sd_comparison_matrix(product_data, competitor_data, content_marketing_criteria_filtered)
social_media_marketing_sd_matrix = calculate_sd_comparison_matrix(product_data, competitor_data, social_media_marketing_criteria_filtered)
# Function to find the top SD values ensuring non-repetitive product and competitor image pairs
def find_top_non_repetitive_sd(sd_matrix, product_data, competitor_data, category, top_count=3):
"""
Find the top SD values ensuring non-repetitive product and competitor image pairs within the same category.
Args:
sd_matrix: DataFrame representing the SD matrix.
product_data: DataFrame containing product data (to extract image names).
competitor_data: DataFrame containing competitor data (to extract image names).
category: String representing the category name.
top_count: Number of top results to return (default is 3).
Returns:
List of tuples containing category, product image name, competitor image name, and SD value.
"""
used_product_images = set()
used_competitor_images = set()
top_results = []
for i in range(sd_matrix.shape[0]):
for j in range(sd_matrix.shape[1]):
if len(top_results) == top_count:
break
product_image = product_data.iloc[i]['Image']
competitor_image = competitor_data.iloc[j]['Image']
sd_value = sd_matrix.iloc[i, j]
if product_image not in used_product_images and competitor_image not in used_competitor_images:
top_results.append((category, product_image, competitor_image, sd_value))
used_product_images.add(product_image)
used_competitor_images.add(competitor_image)
return top_results
# Find top non-repetitive SD results
branding_top_3 = find_top_non_repetitive_sd(branding_sd_matrix, product_data, competitor_data, "Brand Marketing")
content_marketing_top_3 = find_top_non_repetitive_sd(content_marketing_sd_matrix, product_data, competitor_data, "Content Marketing")
social_media_marketing_top_3 = find_top_non_repetitive_sd(social_media_marketing_sd_matrix, product_data, competitor_data, "Social Media Marketing")
# Combine results into a DataFrame
all_top_3 = branding_top_3 + content_marketing_top_3 + social_media_marketing_top_3
top_3_df = pd.DataFrame(
all_top_3,
columns=['Category', 'Product_Image_Name', 'Competitor_Image_Name', 'SD_Value']
)
# Save results to Excel
top_3_df.to_excel("Output_File/excel/top_3_sd_results.xlsx", index=False)
# Print the results
print("\nTop 3 SD Results DataFrame:")
print(top_3_df)
import os
output_folder = "data/output_generated_file/Output_File/excel"
output_file_path = os.path.join(output_folder, "top_3_sd_results.xlsx")
os.makedirs(output_folder, exist_ok=True)
# Save results to the specified folder
top_3_df.to_excel(output_file_path, index=False)
# Print confirmation
print(f"Top 3 SD Results saved in: {output_file_path}")