# Spaces: Askhedi / flaskapp (Sleeping)
# File size: 10,714 Bytes
# 67b1c6c

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from loguru import logger

class FeatureAnalyzer:
    """Render exploratory plots that compare features across the ``fake`` label.

    Every ``plot_*`` method reads ``self.df``, writes one PNG file into
    ``self.output_path`` (created on demand) and logs the saved location.
    The plots treat ``fake`` as a numeric 0/1 column: 0 is drawn as
    "Genuine (0)" and 1 as "Fake (1)".

    Args:
        df: DataFrame holding the ``fake`` column plus the feature columns
            referenced by the individual plots.
        output_path: Directory (str or path-like) that receives the PNG files.
    """

    # Feature columns summarised by the per-label plots, in plotting order.
    KEY_FEATURES = (
        'review_stars', 'business_stars', 'business_review_count', 'user_review_count',
        'friends', 'fans', 'average_stars', 'tip_compliment_count', 'tip_count',
        'time_since_last_review_user', 'user_account_age', 'pronoun_density',
        'grammar_error_score', 'repetitive_words_count', 'similarity_to_other_reviews',
        'review_useful_funny_cool', 'user_useful_funny_cool', 'sentiment_polarity',
    )
    # Label values with their display names and colours, index-aligned.
    LABEL_VALUES = (0, 1)
    LABEL_NAMES = ('Genuine (0)', 'Fake (1)')
    LABEL_COLORS = ('skyblue', 'salmon')

    def __init__(self, df, output_path):
        self.df = df
        self.output_path = output_path

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------

    def _output_file(self, filename):
        """Create the output directory if needed and return the path of *filename* in it."""
        out_dir = Path(self.output_path)
        out_dir.mkdir(parents=True, exist_ok=True)
        return out_dir / filename

    def _save(self, fig, output_file, description, tight_layout=True):
        """Write *fig* to *output_file*, always close it, then log the location."""
        try:
            if tight_layout:
                fig.tight_layout()
            fig.savefig(output_file, dpi=300, bbox_inches='tight')
        finally:
            # Close this specific figure even when saving fails so that
            # repeated runs do not accumulate open figures.
            plt.close(fig)
        logger.info(f"Saved {description} to {output_file}")

    def _label_counts(self):
        """Return row counts per label, ordered as ``LABEL_VALUES``.

        ``value_counts`` orders by frequency, so the result is re-indexed to a
        fixed label order; a label that never occurs gets a count of 0.
        """
        return self.df['fake'].value_counts().reindex(list(self.LABEL_VALUES), fill_value=0)

    def _stat_by_label(self, stat):
        """Return a features x labels table of *stat* (e.g. ``'mean'``, ``'var'``).

        Columns are always ``LABEL_NAMES`` in that order; a label that is
        absent from the data yields a NaN column instead of an error.
        """
        table = self.df.groupby('fake')[list(self.KEY_FEATURES)].agg(stat).T
        table = table.reindex(columns=list(self.LABEL_VALUES))
        table.columns = list(self.LABEL_NAMES)
        return table

    def _plot_stat_bar(self, stat, title, ylabel, filename, description):
        """Draw a grouped bar chart of *stat* per key feature, split by label."""
        output_file = self._output_file(filename)
        table = self._stat_by_label(stat)
        fig, ax = plt.subplots(figsize=(12, 8))
        # Passing ax keeps pandas from opening a second figure, which would
        # ignore the requested size and leave this one open.
        table.plot(kind='bar', color=list(self.LABEL_COLORS), width=0.8, ax=ax)
        ax.set_title(title, fontsize=16)
        ax.set_xlabel('Features', fontsize=12)
        ax.set_ylabel(ylabel, fontsize=12)
        plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
        ax.legend(title='Fake Label')
        self._save(fig, output_file, description)

    def _plot_feature_grid(self, plot_func, features, filename, description):
        """Draw one *plot_func* panel per feature (x = ``fake``) on a 2x3 grid."""
        output_file = self._output_file(filename)
        fig = plt.figure(figsize=(14, 10))
        for position, feature in enumerate(features, 1):
            # add_subplot creates only the panels that are actually used.
            ax = fig.add_subplot(2, 3, position)
            # NOTE(review): newer seaborn releases may warn about `palette`
            # without `hue` -- confirm against the pinned seaborn version.
            plot_func(x='fake', y=feature, data=self.df,
                      palette=list(self.LABEL_COLORS), ax=ax)
            ax.set_title(f'{feature} Distribution', fontsize=12)
            ax.set_xlabel('Fake (0/1)', fontsize=10)
        self._save(fig, output_file, description)

    # ------------------------------------------------------------------
    # Public plots
    # ------------------------------------------------------------------

    def plot_correlation_heatmap(self):
        """Save ``correlation_heatmap.png``: correlations of numeric columns except ``fake``."""
        output_file = self._output_file('correlation_heatmap.png')
        # errors='ignore' tolerates a frame whose 'fake' column is not numeric
        # (or missing), instead of raising KeyError.
        numeric_cols = (
            self.df.select_dtypes(include=[np.number]).columns.drop('fake', errors='ignore')
        )
        correlation_matrix = self.df[numeric_cols].corr()
        fig, ax = plt.subplots(figsize=(14, 12))
        sns.heatmap(correlation_matrix, annot=False, cmap='coolwarm',
                    vmin=-1, vmax=1, center=0, ax=ax)
        ax.set_title('Correlation Heatmap of Numeric Features', fontsize=16)
        self._save(fig, output_file, 'correlation heatmap')

    def plot_mean_by_fake_bar(self):
        """Save ``mean_by_fake_bar.png``: mean of each key feature per label."""
        self._plot_stat_bar(
            'mean',
            title='Mean Feature Values by Fake Label',
            ylabel='Mean Value',
            filename='mean_by_fake_bar.png',
            description='mean by fake bar plot',
        )

    def plot_violin_plots(self):
        """Save ``violin_plots.png``: violin plots of the first six key features."""
        self._plot_feature_grid(
            sns.violinplot, self.KEY_FEATURES[:6], 'violin_plots.png', 'violin plots'
        )

    def plot_box_plots(self):
        """Save ``box_plots.png``: box plots of key features 7 through 11."""
        self._plot_feature_grid(
            sns.boxplot, self.KEY_FEATURES[6:11], 'box_plots.png', 'box plots'
        )

    def plot_scatter_review_grammar(self):
        """Save ``scatter_review_grammar.png``: review stars vs grammar error score."""
        output_file = self._output_file('scatter_review_grammar.png')
        fig, ax = plt.subplots(figsize=(10, 6))
        sns.scatterplot(x='review_stars', y='grammar_error_score', hue='fake',
                        data=self.df, palette=['blue', 'red'], alpha=0.5, ax=ax)
        ax.set_title('Review Stars vs Grammar Error Score by Fake Label', fontsize=16)
        ax.set_xlabel('Review Stars', fontsize=12)
        ax.set_ylabel('Grammar Error Score', fontsize=12)
        ax.legend(title='Fake')
        self._save(fig, output_file, 'scatter plot')

    def plot_density_plots(self):
        """Save ``density_plots.png``: per-label KDE of the first four key features."""
        output_file = self._output_file('density_plots.png')
        fig = plt.figure(figsize=(14, 10))
        for position, feature in enumerate(self.KEY_FEATURES[:4], 1):
            ax = fig.add_subplot(2, 2, position)
            for label in self.LABEL_VALUES:
                subset = self.df[self.df['fake'] == label]
                if subset.empty:
                    # Nothing to estimate a density from for this label.
                    continue
                sns.kdeplot(subset[feature], label=f'Fake={label}',
                            fill=True, alpha=0.5, ax=ax)
            ax.set_title(f'{feature} Density', fontsize=12)
            ax.set_xlabel(feature, fontsize=10)
            ax.legend()
        self._save(fig, output_file, 'density plots')

    def plot_stacked_bar_similarity(self):
        """Save ``stacked_bar_similarity.png``: label proportions per similarity bin.

        ``similarity_to_other_reviews`` is cut into 10 equal-width bins; each
        bar shows the share of genuine and fake rows inside one bin.
        """
        output_file = self._output_file('stacked_bar_similarity.png')
        bins = pd.cut(self.df['similarity_to_other_reviews'], bins=10)
        counts = (
            # observed=False spells out the behaviour relied on here:
            # empty bins are kept so that all 10 bins appear on the axis.
            self.df.groupby([bins, 'fake'], observed=False)
            .size()
            .unstack(fill_value=0)
            # Fixed column order keeps colours and legend tied to the labels
            # even when one label is absent from the data.
            .reindex(columns=list(self.LABEL_VALUES), fill_value=0)
        )
        counts.columns = list(self.LABEL_NAMES)
        proportions = counts.div(counts.sum(axis=1), axis=0)
        fig, ax = plt.subplots(figsize=(12, 8))
        proportions.plot(kind='bar', stacked=True, color=list(self.LABEL_COLORS),
                         width=0.8, ax=ax)
        ax.set_title('Proportion of Fake by Similarity to Other Reviews Bins', fontsize=16)
        ax.set_xlabel('Similarity Bins', fontsize=12)
        ax.set_ylabel('Proportion', fontsize=12)
        ax.legend(title='Fake Label')
        plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
        self._save(fig, output_file, 'stacked bar plot')

    def plot_pie_fake_distribution(self):
        """Save ``pie_fake_distribution.png``: share of genuine vs fake rows."""
        output_file = self._output_file('pie_fake_distribution.png')
        counts = self._label_counts()
        # Only draw labels that occur, keeping each wedge paired with its own
        # name and colour regardless of which label is more frequent.
        shown = [pos for pos, count in enumerate(counts) if count > 0]
        fig, ax = plt.subplots(figsize=(8, 8))
        ax.pie(
            counts.iloc[shown],
            labels=[self.LABEL_NAMES[pos] for pos in shown],
            colors=[self.LABEL_COLORS[pos] for pos in shown],
            autopct='%1.1f%%',
            startangle=90,
        )
        ax.set_title('Distribution of Fake Labels', fontsize=16)
        ax.axis('equal')
        self._save(fig, output_file, 'pie chart', tight_layout=False)

    def plot_count_code_switching(self):
        """Save ``count_code_switching.png``: row counts per code-switching flag and label."""
        output_file = self._output_file('count_code_switching.png')
        fig, ax = plt.subplots(figsize=(8, 6))
        sns.countplot(x='code_switching_flag', hue='fake', data=self.df,
                      palette=list(self.LABEL_COLORS), ax=ax)
        ax.set_title('Count of Fake by Code Switching Flag', fontsize=16)
        ax.set_xlabel('Code Switching Flag (0/1)', fontsize=12)
        ax.set_ylabel('Count', fontsize=12)
        ax.legend(title='Fake Label')
        self._save(fig, output_file, 'count plot')

    def plot_variance_by_fake_bar(self):
        """Save ``variance_by_fake_bar.png``: variance of each key feature per label."""
        self._plot_stat_bar(
            'var',
            title='Feature Variance by Fake Label',
            ylabel='Variance',
            filename='variance_by_fake_bar.png',
            description='variance bar plot',
        )

    def run_pipeline(self):
        """Apply the shared plot style, then generate every plot in sequence."""
        sns.set(style="whitegrid")
        plt.rcParams['figure.figsize'] = (12, 8)
        self.plot_correlation_heatmap()
        self.plot_mean_by_fake_bar()
        self.plot_violin_plots()
        self.plot_box_plots()
        self.plot_scatter_review_grammar()
        self.plot_density_plots()
        self.plot_stacked_bar_similarity()
        self.plot_pie_fake_distribution()
        self.plot_count_code_switching()
        self.plot_variance_by_fake_bar()