| import pandas as pd | |
| import numpy as np | |
| import matplotlib.pyplot as plt | |
| import seaborn as sns | |
| from pathlib import Path | |
| from loguru import logger | |
class FeatureAnalyzer:
    """Generate and save exploratory plots of a DataFrame split by ``fake``.

    Every ``plot_*`` method draws one figure from ``self.df`` and writes it
    as a PNG into ``self.output_path`` (created on demand). The plot titles
    treat ``fake == 0`` as "Genuine" and ``fake == 1`` as "Fake".
    """

    # Columns summarised by the per-label plots. Individual plots take a
    # slice of this tuple so each figure stays readable.
    KEY_FEATURES = (
        'review_stars', 'business_stars', 'business_review_count', 'user_review_count',
        'friends', 'fans', 'average_stars', 'tip_compliment_count', 'tip_count',
        'time_since_last_review_user', 'user_account_age', 'pronoun_density',
        'grammar_error_score', 'repetitive_words_count', 'similarity_to_other_reviews',
        'review_useful_funny_cool', 'user_useful_funny_cool', 'sentiment_polarity',
    )

    # Display name and colour for each value of the ``fake`` column.
    _LABEL_NAMES = {0: 'Genuine (0)', 1: 'Fake (1)'}
    _LABEL_COLORS = {0: 'skyblue', 1: 'salmon'}

    def __init__(self, df, output_path):
        """Store the data to analyse and the directory that receives plots.

        Args:
            df: DataFrame holding a ``fake`` column plus the feature columns
                read by the individual plot methods.
            output_path: Directory (``str`` or ``Path``) for the PNG files.
        """
        self.df = df
        self.output_path = output_path

    # ------------------------------------------------------------------
    # Helpers
    # ------------------------------------------------------------------
    @classmethod
    def _label_name(cls, label):
        """Return the display name for a ``fake`` value (``str(label)`` if unknown)."""
        return cls._LABEL_NAMES.get(label, str(label))

    @classmethod
    def _label_color(cls, label):
        """Return the plot colour for a ``fake`` value (grey if unknown)."""
        return cls._LABEL_COLORS.get(label, 'gray')

    def _numeric_feature_columns(self):
        """Return the numeric column labels of ``self.df`` without ``fake``.

        ``errors='ignore'`` keeps this working when ``fake`` is not among the
        numeric columns (e.g. stored as bool) instead of raising ``KeyError``.
        """
        numeric_cols = self.df.select_dtypes(include=[np.number]).columns
        return numeric_cols.drop('fake', errors='ignore')

    def _fake_label_counts(self):
        """Return row counts per ``fake`` value, ordered by the label itself.

        ``value_counts`` orders by frequency, so the result is re-sorted by
        index to give callers a stable label order.
        """
        return self.df['fake'].value_counts().sort_index()

    def _save_and_close(self, fig, filename, description):
        """Write ``fig`` to ``output_path/filename`` and always close it.

        Args:
            fig: The matplotlib figure to save.
            filename: File name of the PNG inside ``self.output_path``.
            description: Human-readable plot name used in the log message.
        """
        try:
            output_dir = Path(self.output_path)
            output_dir.mkdir(parents=True, exist_ok=True)
            output_file = output_dir / filename
            fig.savefig(output_file, dpi=300, bbox_inches='tight')
        finally:
            # Close explicitly by handle so no figure is left open even if
            # saving fails or another figure became "current" meanwhile.
            plt.close(fig)
        logger.info(f"Saved {description} to {output_file}")

    def _plot_stat_by_fake(self, stat, title, ylabel, filename, description):
        """Draw a grouped bar chart of a per-label statistic of KEY_FEATURES.

        Args:
            stat: Name of the groupby reduction to apply (``'mean'``/``'var'``).
            title: Figure title.
            ylabel: Y-axis label.
            filename: Output PNG file name.
            description: Plot name used in the log message.
        """
        grouped = self.df.groupby('fake')[list(self.KEY_FEATURES)]
        summary = getattr(grouped, stat)().T
        # Derive names/colours from the labels actually present instead of
        # assuming exactly two columns in a fixed order.
        labels = list(summary.columns)
        colors = [self._label_color(label) for label in labels]
        summary.columns = [self._label_name(label) for label in labels]

        fig, ax = plt.subplots(figsize=(12, 8))
        # Pass ``ax`` so pandas draws on this figure rather than opening a
        # second one and leaving this one open.
        summary.plot(kind='bar', color=colors, width=0.8, ax=ax)
        ax.set_title(title, fontsize=16)
        ax.set_xlabel('Features', fontsize=12)
        ax.set_ylabel(ylabel, fontsize=12)
        plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
        ax.legend(title='Fake Label')
        fig.tight_layout()
        self._save_and_close(fig, filename, description)

    def _plot_distribution_grid(self, plot_func, features, filename, description):
        """Draw one seaborn categorical plot per feature on a 2x3 grid.

        Args:
            plot_func: Seaborn function such as ``sns.violinplot``.
            features: Up to six column names, one per grid cell.
            filename: Output PNG file name.
            description: Plot name used in the log message.
        """
        fig = plt.figure(figsize=(14, 10))
        for position, feature in enumerate(features, 1):
            ax = fig.add_subplot(2, 3, position)
            # NOTE(review): recent seaborn releases warn when ``palette`` is
            # given without ``hue``; kept as-is for older-version compatibility.
            plot_func(x='fake', y=feature, data=self.df,
                      palette=['skyblue', 'salmon'], ax=ax)
            ax.set_title(f'{feature} Distribution', fontsize=12)
            ax.set_xlabel('Fake (0/1)', fontsize=10)
        fig.tight_layout()
        self._save_and_close(fig, filename, description)

    # ------------------------------------------------------------------
    # Plots
    # ------------------------------------------------------------------
    def plot_correlation_heatmap(self):
        """Save a heatmap of pairwise correlations between numeric features."""
        correlation_matrix = self.df[self._numeric_feature_columns()].corr()
        fig, ax = plt.subplots(figsize=(14, 12))
        sns.heatmap(correlation_matrix, annot=False, cmap='coolwarm',
                    vmin=-1, vmax=1, center=0, ax=ax)
        ax.set_title('Correlation Heatmap of Numeric Features', fontsize=16)
        fig.tight_layout()
        self._save_and_close(fig, 'correlation_heatmap.png', 'correlation heatmap')

    def plot_mean_by_fake_bar(self):
        """Save a grouped bar chart of each key feature's mean per label."""
        self._plot_stat_by_fake(
            'mean',
            title='Mean Feature Values by Fake Label',
            ylabel='Mean Value',
            filename='mean_by_fake_bar.png',
            description='mean by fake bar plot',
        )

    def plot_violin_plots(self):
        """Save violin plots of the first six key features per label."""
        self._plot_distribution_grid(
            sns.violinplot, self.KEY_FEATURES[:6],
            'violin_plots.png', 'violin plots',
        )

    def plot_box_plots(self):
        """Save box plots of key features 7 through 11 per label."""
        self._plot_distribution_grid(
            sns.boxplot, self.KEY_FEATURES[6:11],
            'box_plots.png', 'box plots',
        )

    def plot_scatter_review_grammar(self):
        """Save a scatter plot of review stars vs grammar error score."""
        fig, ax = plt.subplots(figsize=(10, 6))
        sns.scatterplot(x='review_stars', y='grammar_error_score', hue='fake',
                        data=self.df, palette=['blue', 'red'], alpha=0.5, ax=ax)
        ax.set_title('Review Stars vs Grammar Error Score by Fake Label', fontsize=16)
        ax.set_xlabel('Review Stars', fontsize=12)
        ax.set_ylabel('Grammar Error Score', fontsize=12)
        ax.legend(title='Fake')
        fig.tight_layout()
        self._save_and_close(fig, 'scatter_review_grammar.png', 'scatter plot')

    def plot_density_plots(self):
        """Save per-label density plots of the first four key features."""
        fig = plt.figure(figsize=(14, 10))
        for position, feature in enumerate(self.KEY_FEATURES[:4], 1):
            ax = fig.add_subplot(2, 2, position)
            for label in (0, 1):
                subset = self.df[self.df['fake'] == label]
                if subset.empty:
                    # Nothing to estimate a density from for this label.
                    continue
                sns.kdeplot(subset[feature], label=f'Fake={label}',
                            fill=True, alpha=0.5, ax=ax)
            ax.set_title(f'{feature} Density', fontsize=12)
            ax.set_xlabel(feature, fontsize=10)
            ax.legend()
        fig.tight_layout()
        self._save_and_close(fig, 'density_plots.png', 'density plots')

    def plot_stacked_bar_similarity(self):
        """Save the per-bin label proportions of ``similarity_to_other_reviews``.

        The feature is cut into 10 equal-width bins; each bar shows the share
        of every ``fake`` value among the rows falling into that bin.
        """
        bins = pd.cut(self.df['similarity_to_other_reviews'], bins=10)
        # ``observed=False`` keeps empty bins on the axis and pins the
        # behaviour regardless of the pandas default for categorical groupers.
        counts = (
            self.df.groupby([bins, 'fake'], observed=False)
            .size()
            .unstack(fill_value=0)
        )
        proportions = counts.div(counts.sum(axis=1), axis=0)
        labels = list(proportions.columns)
        colors = [self._label_color(label) for label in labels]
        proportions.columns = [self._label_name(label) for label in labels]

        fig, ax = plt.subplots(figsize=(12, 8))
        proportions.plot(kind='bar', stacked=True, color=colors, width=0.8, ax=ax)
        ax.set_title('Proportion of Fake by Similarity to Other Reviews Bins', fontsize=16)
        ax.set_xlabel('Similarity Bins', fontsize=12)
        ax.set_ylabel('Proportion', fontsize=12)
        ax.legend(title='Fake Label')
        plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
        fig.tight_layout()
        self._save_and_close(fig, 'stacked_bar_similarity.png', 'stacked bar plot')

    def plot_pie_fake_distribution(self):
        """Save a pie chart of how many rows carry each ``fake`` value."""
        # Counts are ordered by label, so names and colours are looked up per
        # label and cannot be swapped when the "fake" class is the majority.
        fake_counts = self._fake_label_counts()
        labels = list(fake_counts.index)
        fig, ax = plt.subplots(figsize=(8, 8))
        ax.pie(
            fake_counts,
            labels=[self._label_name(label) for label in labels],
            colors=[self._label_color(label) for label in labels],
            autopct='%1.1f%%',
            startangle=90,
        )
        ax.set_title('Distribution of Fake Labels', fontsize=16)
        ax.axis('equal')
        self._save_and_close(fig, 'pie_fake_distribution.png', 'pie chart')

    def plot_count_code_switching(self):
        """Save row counts per ``code_switching_flag`` value, split by label."""
        fig, ax = plt.subplots(figsize=(8, 6))
        sns.countplot(x='code_switching_flag', hue='fake', data=self.df,
                      palette=['skyblue', 'salmon'], ax=ax)
        ax.set_title('Count of Fake by Code Switching Flag', fontsize=16)
        ax.set_xlabel('Code Switching Flag (0/1)', fontsize=12)
        ax.set_ylabel('Count', fontsize=12)
        ax.legend(title='Fake Label')
        fig.tight_layout()
        self._save_and_close(fig, 'count_code_switching.png', 'count plot')

    def plot_variance_by_fake_bar(self):
        """Save a grouped bar chart of each key feature's variance per label."""
        self._plot_stat_by_fake(
            'var',
            title='Feature Variance by Fake Label',
            ylabel='Variance',
            filename='variance_by_fake_bar.png',
            description='variance bar plot',
        )

    def run_pipeline(self):
        """Apply the global plot style and produce every plot in sequence."""
        sns.set(style="whitegrid")
        plt.rcParams['figure.figsize'] = (12, 8)
        self.plot_correlation_heatmap()
        self.plot_mean_by_fake_bar()
        self.plot_violin_plots()
        self.plot_box_plots()
        self.plot_scatter_review_grammar()
        self.plot_density_plots()
        self.plot_stacked_bar_similarity()
        self.plot_pie_fake_distribution()
        self.plot_count_code_switching()
        self.plot_variance_by_fake_bar()