import pandas as pd import numpy as np import matplotlib.pyplot as plt import seaborn as sns from pathlib import Path from loguru import logger class FeatureAnalyzer: def __init__(self,df,output_path): self.df=df self.output_path=output_path def plot_correlation_heatmap(self): Path(self.output_path).mkdir(parents=True, exist_ok=True) numeric_cols = self.df.select_dtypes(include=[np.number]).columns.drop('fake') correlation_matrix = self.df[numeric_cols].corr() plt.figure(figsize=(14, 12)) sns.heatmap(correlation_matrix, annot=False, cmap='coolwarm', vmin=-1, vmax=1, center=0) plt.title('Correlation Heatmap of Numeric Features', fontsize=16) plt.tight_layout() output_file = Path(self.output_path) / 'correlation_heatmap.png' plt.savefig(output_file, dpi=300, bbox_inches='tight') plt.close() logger.info(f"Saved correlation heatmap to {output_file}") def plot_mean_by_fake_bar(self): key_features = [ 'review_stars', 'business_stars', 'business_review_count', 'user_review_count', 'friends', 'fans', 'average_stars', 'tip_compliment_count', 'tip_count', 'time_since_last_review_user', 'user_account_age', 'pronoun_density', 'grammar_error_score', 'repetitive_words_count', 'similarity_to_other_reviews', 'review_useful_funny_cool', 'user_useful_funny_cool', 'sentiment_polarity' ] Path(self.output_path).mkdir(parents=True, exist_ok=True) mean_by_fake = self.df.groupby('fake')[key_features].mean().T mean_by_fake.columns = ['Genuine (0)', 'Fake (1)'] plt.figure(figsize=(12, 8)) mean_by_fake.plot(kind='bar', color=['skyblue', 'salmon'], width=0.8) plt.title('Mean Feature Values by Fake Label', fontsize=16) plt.xlabel('Features', fontsize=12) plt.ylabel('Mean Value', fontsize=12) plt.xticks(rotation=45, ha='right') plt.legend(title='Fake Label') plt.tight_layout() output_file = Path(self.output_path) / 'mean_by_fake_bar.png' plt.savefig(output_file, dpi=300, bbox_inches='tight') plt.close() logger.info(f"Saved mean by fake bar plot to {output_file}") def plot_violin_plots(self): key_features = [ 'review_stars', 'business_stars', 'business_review_count', 'user_review_count', 'friends', 'fans', 'average_stars', 'tip_compliment_count', 'tip_count', 'time_since_last_review_user', 'user_account_age', 'pronoun_density', 'grammar_error_score', 'repetitive_words_count', 'similarity_to_other_reviews', 'review_useful_funny_cool', 'user_useful_funny_cool', 'sentiment_polarity' ] Path(self.output_path).mkdir(parents=True, exist_ok=True) plt.figure(figsize=(14, 10)) for i, feature in enumerate(key_features[:6], 1): plt.subplot(2, 3, i) sns.violinplot(x='fake', y=feature, data=self.df, palette=['skyblue', 'salmon']) plt.title(f'{feature} Distribution', fontsize=12) plt.xlabel('Fake (0/1)', fontsize=10) plt.tight_layout() output_file = Path(self.output_path) / 'violin_plots.png' plt.savefig(output_file, dpi=300, bbox_inches='tight') plt.close() logger.info(f"Saved violin plots to {output_file}") def plot_box_plots(self): key_features = [ 'review_stars', 'business_stars', 'business_review_count', 'user_review_count', 'friends', 'fans', 'average_stars', 'tip_compliment_count', 'tip_count', 'time_since_last_review_user', 'user_account_age', 'pronoun_density', 'grammar_error_score', 'repetitive_words_count', 'similarity_to_other_reviews', 'review_useful_funny_cool', 'user_useful_funny_cool', 'sentiment_polarity' ] Path(self.output_path).mkdir(parents=True, exist_ok=True) plt.figure(figsize=(14, 10)) for i, feature in enumerate(key_features[6:11], 1): plt.subplot(2, 3, i) sns.boxplot(x='fake', y=feature, data=self.df, palette=['skyblue', 'salmon']) plt.title(f'{feature} Distribution', fontsize=12) plt.xlabel('Fake (0/1)', fontsize=10) plt.tight_layout() output_file = Path(self.output_path) / 'box_plots.png' plt.savefig(output_file, dpi=300, bbox_inches='tight') plt.close() logger.info(f"Saved box plots to {output_file}") def plot_scatter_review_grammar(self): Path(self.output_path).mkdir(parents=True, exist_ok=True) plt.figure(figsize=(10, 6)) sns.scatterplot(x='review_stars', y='grammar_error_score', hue='fake', data=self.df, palette=['blue', 'red'], alpha=0.5) plt.title('Review Stars vs Grammar Error Score by Fake Label', fontsize=16) plt.xlabel('Review Stars', fontsize=12) plt.ylabel('Grammar Error Score', fontsize=12) plt.legend(title='Fake') plt.tight_layout() output_file = Path(self.output_path) / 'scatter_review_grammar.png' plt.savefig(output_file, dpi=300, bbox_inches='tight') plt.close() logger.info(f"Saved scatter plot to {output_file}") def plot_density_plots(self): key_features = [ 'review_stars', 'business_stars', 'business_review_count', 'user_review_count', 'friends', 'fans', 'average_stars', 'tip_compliment_count', 'tip_count', 'time_since_last_review_user', 'user_account_age', 'pronoun_density', 'grammar_error_score', 'repetitive_words_count', 'similarity_to_other_reviews', 'review_useful_funny_cool', 'user_useful_funny_cool', 'sentiment_polarity' ] Path(self.output_path).mkdir(parents=True, exist_ok=True) plt.figure(figsize=(14, 10)) for i, feature in enumerate(key_features[:4], 1): plt.subplot(2, 2, i) for label in [0, 1]: subset = self.df[self.df['fake'] == label] sns.kdeplot(subset[feature], label=f'Fake={label}', fill=True, alpha=0.5) plt.title(f'{feature} Density', fontsize=12) plt.xlabel(feature, fontsize=10) plt.legend() plt.tight_layout() output_file = Path(self.output_path) / 'density_plots.png' plt.savefig(output_file, dpi=300, bbox_inches='tight') plt.close() logger.info(f"Saved density plots to {output_file}") def plot_stacked_bar_similarity(self): Path(self.output_path).mkdir(parents=True, exist_ok=True) bins = pd.cut(self.df['similarity_to_other_reviews'], bins=10) stacked_data = self.df.groupby([bins, 'fake']).size().unstack(fill_value=0) stacked_data = stacked_data.div(stacked_data.sum(axis=1), axis=0) plt.figure(figsize=(12, 8)) stacked_data.plot(kind='bar', stacked=True, color=['skyblue', 'salmon'], width=0.8) plt.title('Proportion of Fake by Similarity to Other Reviews Bins', fontsize=16) plt.xlabel('Similarity Bins', fontsize=12) plt.ylabel('Proportion', fontsize=12) plt.legend(['Genuine (0)', 'Fake (1)'], title='Fake Label') plt.xticks(rotation=45, ha='right') plt.tight_layout() output_file = Path(self.output_path) / 'stacked_bar_similarity.png' plt.savefig(output_file, dpi=300, bbox_inches='tight') plt.close() logger.info(f"Saved stacked bar plot to {output_file}") def plot_pie_fake_distribution(self): Path(self.output_path).mkdir(parents=True, exist_ok=True) fake_counts = self.df['fake'].value_counts() plt.figure(figsize=(8, 8)) plt.pie(fake_counts, labels=['Genuine (0)', 'Fake (1)'], colors=['skyblue', 'salmon'], autopct='%1.1f%%', startangle=90) plt.title('Distribution of Fake Labels', fontsize=16) plt.axis('equal') output_file = Path(self.output_path) / 'pie_fake_distribution.png' plt.savefig(output_file, dpi=300, bbox_inches='tight') plt.close() logger.info(f"Saved pie chart to {output_file}") def plot_count_code_switching(self): Path(self.output_path).mkdir(parents=True, exist_ok=True) plt.figure(figsize=(8, 6)) sns.countplot(x='code_switching_flag', hue='fake', data=self.df, palette=['skyblue', 'salmon']) plt.title('Count of Fake by Code Switching Flag', fontsize=16) plt.xlabel('Code Switching Flag (0/1)', fontsize=12) plt.ylabel('Count', fontsize=12) plt.legend(title='Fake Label') plt.tight_layout() output_file = Path(self.output_path) / 'count_code_switching.png' plt.savefig(output_file, dpi=300, bbox_inches='tight') plt.close() logger.info(f"Saved count plot to {output_file}") def plot_variance_by_fake_bar(self): key_features = [ 'review_stars', 'business_stars', 'business_review_count', 'user_review_count', 'friends', 'fans', 'average_stars', 'tip_compliment_count', 'tip_count', 'time_since_last_review_user', 'user_account_age', 'pronoun_density', 'grammar_error_score', 'repetitive_words_count', 'similarity_to_other_reviews', 'review_useful_funny_cool', 'user_useful_funny_cool', 'sentiment_polarity' ] Path(self.output_path).mkdir(parents=True, exist_ok=True) variance_by_fake = self.df.groupby('fake')[key_features].var().T variance_by_fake.columns = ['Genuine (0)', 'Fake (1)'] plt.figure(figsize=(12, 8)) variance_by_fake.plot(kind='bar', color=['skyblue', 'salmon'], width=0.8) plt.title('Feature Variance by Fake Label', fontsize=16) plt.xlabel('Features', fontsize=12) plt.ylabel('Variance', fontsize=12) plt.xticks(rotation=45, ha='right') plt.legend(title='Fake Label') plt.tight_layout() output_file = Path(self.output_path) / 'variance_by_fake_bar.png' plt.savefig(output_file, dpi=300, bbox_inches='tight') plt.close() logger.info(f"Saved variance bar plot to {output_file}") def run_pipeline(self): sns.set(style="whitegrid") plt.rcParams['figure.figsize'] = (12, 8) self.plot_correlation_heatmap() self.plot_mean_by_fake_bar() self.plot_violin_plots() self.plot_box_plots() self.plot_scatter_review_grammar() self.plot_density_plots() self.plot_stacked_bar_similarity() self.plot_pie_fake_distribution() self.plot_count_code_switching() self.plot_variance_by_fake_bar()