"""Lexicon-based gender-term counting, tagging and plotting for text data."""

import json
import re

import pandas as pd
import plotly.express as px

# Pattern that can never match: an empty negative lookahead always fails.
# Used in place of an empty lexicon so that counting yields 0.
_NEVER_MATCH = re.compile(r"(?!)")


def load_gender_lexicons():
    """Load the gender lexicons from ``config/gender_lexicons.json``.

    Returns:
        The parsed JSON content. The rest of this module reads the keys
        ``"male_lexicons"`` and ``"female_lexicons"`` from it.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # Explicit encoding: without it the platform's locale encoding is used.
    with open("config/gender_lexicons.json", "r", encoding="utf-8") as lexicon_file:
        return json.load(lexicon_file)


def count_gender_terms(text, gender_pattern):
    """Return the number of non-overlapping matches of *gender_pattern* in *text*."""
    return len(gender_pattern.findall(text))


def get_gender_tag(count_male_terms, count_female_terms):
    """Map male/female term counts to a gender category label.

    Returns:
        ``"No Gender"`` when both counts are zero, ``"Equal Gender"`` when
        the counts are equal and non-zero, otherwise ``"<Male|Female>
        Strongly Positive Gender"`` when the dominant side holds at least
        75% of the terms and ``"<Male|Female> Positive Gender"`` when it
        holds more than 50% but less than 75%.
    """
    total_terms = count_male_terms + count_female_terms
    if total_terms == 0:
        return "No Gender"

    # The tie must be tested before the proportion thresholds: at 50/50 both
    # proportions satisfy ">= 50", so a later check could never be reached.
    if count_male_terms == count_female_terms:
        return "Equal Gender"

    if count_male_terms > count_female_terms:
        male_proportion = (count_male_terms / total_terms) * 100
        if male_proportion >= 75:
            return "Male Strongly Positive Gender"
        return "Male Positive Gender"

    female_proportion = (count_female_terms / total_terms) * 100
    if female_proportion >= 75:
        return "Female Strongly Positive Gender"
    return "Female Positive Gender"


def _compile_lexicon_pattern(terms):
    """Compile a whole-word alternation regex for an iterable of terms."""
    # Terms are lower-cased because the analysed text is lower-cased too.
    unique_terms = {term.lower() for term in (terms or ())}
    # An empty alternative would match the empty string at word boundaries.
    unique_terms.discard("")
    if not unique_terms:
        return _NEVER_MATCH

    # Longest-first, then alphabetical: a fixed order keeps counts
    # reproducible (set order varies between runs) and lets a multi-word
    # term win over a shorter term that is its prefix.
    ordered_terms = sorted(unique_terms, key=lambda term: (-len(term), term))
    return re.compile(r"\b({})\b".format("|".join(map(re.escape, ordered_terms))))


def _build_gender_patterns(gender_lexicons):
    """Return ``(male_pattern, female_pattern)`` built from the lexicon mapping."""
    return (
        _compile_lexicon_pattern(gender_lexicons.get("male_lexicons")),
        _compile_lexicon_pattern(gender_lexicons.get("female_lexicons")),
    )


def _classify_text(text, male_pattern, female_pattern):
    """Count matches of both patterns in *text* and tag the result."""
    if not isinstance(text, str):
        # Missing scalars (None, NaN, pd.NA, ...) count as empty text; any
        # other non-string value is analysed through its string form.
        if pd.api.types.is_scalar(text) and pd.isna(text):
            text = ""
        else:
            text = str(text)

    normalized_text = text.lower().strip()
    count_male_terms = count_gender_terms(normalized_text, male_pattern)
    count_female_terms = count_gender_terms(normalized_text, female_pattern)
    gender_category = get_gender_tag(count_male_terms, count_female_terms)
    return count_male_terms, count_female_terms, gender_category


def analyze_text(text, gender_lexicons):
    """Count male and female lexicon terms in *text* and categorise it.

    Args:
        text: The text to analyse. It is lower-cased and stripped before
            matching; a missing value is treated as an empty string.
        gender_lexicons: Mapping with the term collections under
            ``"male_lexicons"`` and ``"female_lexicons"``. A missing or
            empty entry yields a count of 0 for that side.

    Returns:
        A tuple ``(count_male_terms, count_female_terms, gender_category)``.
    """
    male_pattern, female_pattern = _build_gender_patterns(gender_lexicons)
    return _classify_text(text, male_pattern, female_pattern)


def plot_gender_category_counts(labels, values):
    """Build a pie chart titled "Gender Distribution" from labels and values.

    Args:
        labels: Slice names, one per category.
        values: Slice sizes, aligned with *labels*.

    Returns:
        The plotly figure.
    """
    fig = px.pie(
        values=values,
        names=labels,
        title="Gender Distribution",
    )
    fig.update_traces(
        # One pull entry per slice rather than a fixed count of six.
        pull=[0.1] * len(labels),
        textinfo="percent+label",
        marker=dict(
            line=dict(color="#000000", width=1),
        ),
    )
    fig.update_layout(showlegend=False)
    return fig


def eval_gender_distribution(data):
    """Analyse the first column of *data* and summarise gender categories.

    The columns ``count_male_terms``, ``count_female_terms`` and
    ``gender_category`` are added to *data* in place.

    Args:
        data: DataFrame whose first column holds the texts to analyse.

    Returns:
        A tuple ``(result_df, result_plot)``: a DataFrame with ``Metric``
        and ``Value`` columns holding the count per category (0 for
        categories that do not occur), and the pie chart of those counts.
    """
    # Compile both patterns once instead of once per row.
    male_pattern, female_pattern = _build_gender_patterns(load_gender_lexicons())

    analyses = [
        _classify_text(text, male_pattern, female_pattern)
        for text in data[data.columns[0]]
    ]
    # Per-column lists also work for an empty frame, where unpacking
    # ``zip(*analyses)`` into three names would raise ValueError.
    data["count_male_terms"] = [analysis[0] for analysis in analyses]
    data["count_female_terms"] = [analysis[1] for analysis in analyses]
    data["gender_category"] = [analysis[2] for analysis in analyses]

    gender_labels = [
        "No Gender",
        "Equal Gender",
        "Male Positive Gender",
        "Male Strongly Positive Gender",
        "Female Positive Gender",
        "Female Strongly Positive Gender",
    ]
    category_counts = (
        data["gender_category"].value_counts().reindex(gender_labels, fill_value=0)
    )

    result_df = pd.DataFrame(
        {"Metric": category_counts.index, "Value": category_counts.values}
    )
    result_plot = plot_gender_category_counts(gender_labels, category_counts)
    return result_df, result_plot