Restructure Logic and Data Flow
Browse files- README.md +1 -1
- app.py +102 -74
- config/gender_lexicons.json +28 -0
- methodologies.json → config/methodologies.json +3 -3
- config/profession_lexicons.json +156 -0
- data/z_house.csv +0 -7
- data/z_sentences.csv +11 -0
- requirements.txt +1 -1
- scripts/genbit.py +14 -0
- scripts/genbit_metrics.py +0 -48
- scripts/{gender_tagging.py → gender_divide.py} +64 -57
- scripts/{gender_profession_tagging.py → gender_profession_bias.py} +50 -68
- utils/config.json +0 -160
- utils/load_csv.py +0 -23
- utils/read_config.py +0 -13
README.md
CHANGED
@@ -4,7 +4,7 @@ emoji: π¦
|
|
4 |
colorFrom: indigo
|
5 |
colorTo: yellow
|
6 |
sdk: gradio
|
7 |
-
sdk_version: 3.
|
8 |
app_file: app.py
|
9 |
pinned: false
|
10 |
license: mit
|
|
|
4 |
colorFrom: indigo
|
5 |
colorTo: yellow
|
6 |
sdk: gradio
|
7 |
+
sdk_version: 3.43.2
|
8 |
app_file: app.py
|
9 |
pinned: false
|
10 |
license: mit
|
app.py
CHANGED
@@ -3,78 +3,105 @@ import gradio as gr
|
|
3 |
import pandas as pd
|
4 |
import os
|
5 |
|
6 |
-
from scripts.
|
7 |
-
from scripts.
|
8 |
-
from scripts.
|
9 |
-
from utils.load_csv import *
|
10 |
-
from utils.read_config import get_args
|
11 |
|
12 |
-
methodologies = json.load(open("methodologies.json", "r"))
|
13 |
|
|
|
14 |
|
15 |
-
def get_methodology_metadata(methodology):
|
16 |
-
title = "## " + methodology
|
17 |
-
description = methodologies.get(methodology).get("description")
|
18 |
|
19 |
-
|
|
|
|
|
|
|
|
|
|
|
20 |
|
21 |
-
|
|
|
|
|
|
|
|
|
|
|
22 |
|
|
|
23 |
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
30 |
|
31 |
-
|
32 |
-
|
33 |
)
|
34 |
|
35 |
-
return gr.JSON.update(status, visible=True)
|
36 |
-
|
37 |
|
38 |
-
def
|
39 |
-
|
|
|
40 |
|
41 |
-
|
42 |
|
43 |
return (
|
44 |
-
gr.
|
45 |
-
|
46 |
-
info="Determines the scope of the dataset to be analyzed",
|
47 |
-
choices=["First", "Last", "Random"],
|
48 |
-
value="First",
|
49 |
-
visible=True,
|
50 |
-
interactive=True,
|
51 |
-
),
|
52 |
-
gr.Slider.update(
|
53 |
-
label=f"Number of Entries",
|
54 |
-
info=f"Determines the number of entries to be analyzed. Due to computational constraints, the maximum number of entries that can be analyzed is {get_args('first_records')}.",
|
55 |
-
minimum=1,
|
56 |
-
maximum=min(data.shape[0], get_args("first_records")),
|
57 |
-
value=min(data.shape[0], get_args("first_records")) // 2,
|
58 |
-
visible=True,
|
59 |
-
interactive=True,
|
60 |
-
),
|
61 |
-
gr.Radio.update(
|
62 |
-
label="Column",
|
63 |
-
info="Determines the column to be analyzed. These are the columns with text data.",
|
64 |
-
choices=columns,
|
65 |
-
value=columns[0],
|
66 |
-
visible=True,
|
67 |
-
interactive=True,
|
68 |
-
),
|
69 |
-
)
|
70 |
-
|
71 |
-
|
72 |
-
def get_column_metadata(dataset, column):
|
73 |
-
data = pd.read_csv(dataset.name)
|
74 |
-
corpus = data[column].head(10).tolist()
|
75 |
-
|
76 |
-
return gr.Dataframe.update(
|
77 |
-
value=pd.DataFrame({f"Data Corpus: {column}": corpus}), visible=True
|
78 |
)
|
79 |
|
80 |
|
@@ -89,19 +116,19 @@ with BiasAware:
|
|
89 |
with gr.Column(scale=2):
|
90 |
gr.Markdown("## Dataset")
|
91 |
|
92 |
-
dataset_file = gr.File(label="Dataset")
|
93 |
dataset_examples = gr.Examples(
|
94 |
[
|
95 |
os.path.join(os.path.dirname(__file__), "data/z_animal.csv"),
|
96 |
os.path.join(os.path.dirname(__file__), "data/z_employee.csv"),
|
97 |
-
os.path.join(os.path.dirname(__file__), "data/
|
98 |
],
|
99 |
inputs=dataset_file,
|
100 |
label="Example Datasets",
|
101 |
)
|
102 |
|
103 |
-
|
104 |
-
|
105 |
dataset_column = gr.Radio(visible=False)
|
106 |
|
107 |
dataset_corpus = gr.Dataframe(
|
@@ -114,14 +141,10 @@ with BiasAware:
|
|
114 |
methodology = gr.Radio(
|
115 |
label="Methodology",
|
116 |
info="Determines the methodology to be used for bias detection",
|
117 |
-
choices=
|
118 |
-
"Gender Divide (Term Identity Diversity)",
|
119 |
-
"Gender Profession Bias (Lexical Evaluation)",
|
120 |
-
"GenBiT (Microsoft Responsible AI Gender Bias Tool)",
|
121 |
-
],
|
122 |
)
|
123 |
|
124 |
-
evalButton = gr.Button("Run Evaluation")
|
125 |
|
126 |
methodology_metadata = gr.Markdown(visible=False)
|
127 |
|
@@ -134,13 +157,18 @@ with BiasAware:
|
|
134 |
)
|
135 |
|
136 |
dataset_file.change(
|
137 |
-
fn=
|
138 |
inputs=[dataset_file],
|
139 |
-
outputs=[
|
|
|
|
|
|
|
|
|
|
|
140 |
)
|
141 |
|
142 |
dataset_column.change(
|
143 |
-
fn=
|
144 |
inputs=[dataset_file, dataset_column],
|
145 |
outputs=[dataset_corpus],
|
146 |
)
|
@@ -148,15 +176,15 @@ with BiasAware:
|
|
148 |
methodology.change(
|
149 |
fn=get_methodology_metadata,
|
150 |
inputs=[methodology],
|
151 |
-
outputs=[methodology_metadata],
|
152 |
)
|
153 |
|
154 |
evalButton.click(
|
155 |
fn=evaluate,
|
156 |
inputs=[
|
157 |
dataset_file,
|
158 |
-
|
159 |
-
|
160 |
dataset_column,
|
161 |
methodology,
|
162 |
],
|
|
|
3 |
import pandas as pd
|
4 |
import os
|
5 |
|
6 |
+
from scripts.genbit import *
|
7 |
+
from scripts.gender_profession_bias import *
|
8 |
+
from scripts.gender_divide import *
|
|
|
|
|
9 |
|
10 |
+
methodologies = json.load(open("config/methodologies.json", "r"))
|
11 |
|
12 |
+
MAX_THRESHOLD = 1000
|
13 |
|
|
|
|
|
|
|
14 |
|
15 |
+
def evaluate(dataset, sampling_method, sampling_size, column, methodology):
|
16 |
+
try:
|
17 |
+
print(
|
18 |
+
f"[{dataset.name.split('/')[-1]}::{column}] - {sampling_method} {sampling_size} entries"
|
19 |
+
)
|
20 |
+
data = pd.read_csv(dataset.name, usecols=[column])
|
21 |
|
22 |
+
if sampling_method == "First":
|
23 |
+
data = data.head(sampling_size)
|
24 |
+
elif sampling_method == "Last":
|
25 |
+
data = data.tail(sampling_size)
|
26 |
+
elif sampling_method == "Random":
|
27 |
+
data = data.sample(n=sampling_size, random_state=42)
|
28 |
|
29 |
+
result = globals()[methodologies.get(methodology).get("fx")](data)
|
30 |
|
31 |
+
return gr.JSON.update(result, visible=True)
|
32 |
+
except Exception as e:
|
33 |
+
return gr.JSON.update(
|
34 |
+
{
|
35 |
+
"error": f"An error occurred while processing the dataset. Please check the dataset and try again. Error: {e}"
|
36 |
+
},
|
37 |
+
visible=True,
|
38 |
+
)
|
39 |
+
|
40 |
+
|
41 |
+
def display_dataset_config(dataset):
|
42 |
+
try:
|
43 |
+
data = pd.read_csv(dataset.name)
|
44 |
+
|
45 |
+
columns = data.select_dtypes(include=["object"]).columns.tolist()
|
46 |
+
corpus = data[columns[0]].tolist()
|
47 |
+
|
48 |
+
return (
|
49 |
+
gr.Radio.update(
|
50 |
+
label="Scope",
|
51 |
+
info="Determines the scope of the dataset to be analyzed",
|
52 |
+
choices=["First", "Last", "Random"],
|
53 |
+
value="First",
|
54 |
+
visible=True,
|
55 |
+
interactive=True,
|
56 |
+
),
|
57 |
+
gr.Slider.update(
|
58 |
+
label=f"Number of Entries",
|
59 |
+
info=f"Determines the number of entries to be analyzed. Due to computational constraints, the maximum number of entries that can be analyzed is {MAX_THRESHOLD}.",
|
60 |
+
minimum=1,
|
61 |
+
maximum=min(data.shape[0], MAX_THRESHOLD),
|
62 |
+
value=min(data.shape[0], MAX_THRESHOLD) // 2,
|
63 |
+
visible=True,
|
64 |
+
interactive=True,
|
65 |
+
),
|
66 |
+
gr.Radio.update(
|
67 |
+
label="Column",
|
68 |
+
info="Determines the column to be analyzed. These are the columns with text data.",
|
69 |
+
choices=columns,
|
70 |
+
value=columns[0],
|
71 |
+
visible=True,
|
72 |
+
interactive=True,
|
73 |
+
),
|
74 |
+
gr.DataFrame.update(
|
75 |
+
value=pd.DataFrame({f"Data Corpus: {columns[0]}": corpus}), visible=True
|
76 |
+
),
|
77 |
+
)
|
78 |
+
except:
|
79 |
+
return (
|
80 |
+
gr.Radio.update(visible=False),
|
81 |
+
gr.Slider.update(visible=False),
|
82 |
+
gr.Radio.update(visible=False),
|
83 |
+
gr.DataFrame.update(visible=False),
|
84 |
+
)
|
85 |
+
|
86 |
+
|
87 |
+
def update_column_metadata(dataset, column):
|
88 |
+
data = pd.read_csv(dataset.name)
|
89 |
+
corpus = data[column].tolist()
|
90 |
|
91 |
+
return gr.Dataframe.update(
|
92 |
+
value=pd.DataFrame({f"Data Corpus: {column}": corpus}), visible=True
|
93 |
)
|
94 |
|
|
|
|
|
95 |
|
96 |
+
def get_methodology_metadata(methodology):
|
97 |
+
title = "## " + methodology
|
98 |
+
description = methodologies.get(methodology).get("description")
|
99 |
|
100 |
+
metadata = f"{title}\n\n{description}"
|
101 |
|
102 |
return (
|
103 |
+
gr.Markdown.update(metadata, visible=True),
|
104 |
+
gr.Button.update(interactive=True, visible=True),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
105 |
)
|
106 |
|
107 |
|
|
|
116 |
with gr.Column(scale=2):
|
117 |
gr.Markdown("## Dataset")
|
118 |
|
119 |
+
dataset_file = gr.File(label="Dataset", file_types=["csv"])
|
120 |
dataset_examples = gr.Examples(
|
121 |
[
|
122 |
os.path.join(os.path.dirname(__file__), "data/z_animal.csv"),
|
123 |
os.path.join(os.path.dirname(__file__), "data/z_employee.csv"),
|
124 |
+
os.path.join(os.path.dirname(__file__), "data/z_sentences.csv"),
|
125 |
],
|
126 |
inputs=dataset_file,
|
127 |
label="Example Datasets",
|
128 |
)
|
129 |
|
130 |
+
dataset_sampling_method = gr.Radio(visible=False)
|
131 |
+
dataset_sampling_size = gr.Slider(visible=False)
|
132 |
dataset_column = gr.Radio(visible=False)
|
133 |
|
134 |
dataset_corpus = gr.Dataframe(
|
|
|
141 |
methodology = gr.Radio(
|
142 |
label="Methodology",
|
143 |
info="Determines the methodology to be used for bias detection",
|
144 |
+
choices=methodologies.keys(),
|
|
|
|
|
|
|
|
|
145 |
)
|
146 |
|
147 |
+
evalButton = gr.Button(value="Run Evaluation", interactive=False)
|
148 |
|
149 |
methodology_metadata = gr.Markdown(visible=False)
|
150 |
|
|
|
157 |
)
|
158 |
|
159 |
dataset_file.change(
|
160 |
+
fn=display_dataset_config,
|
161 |
inputs=[dataset_file],
|
162 |
+
outputs=[
|
163 |
+
dataset_sampling_method,
|
164 |
+
dataset_sampling_size,
|
165 |
+
dataset_column,
|
166 |
+
dataset_corpus,
|
167 |
+
],
|
168 |
)
|
169 |
|
170 |
dataset_column.change(
|
171 |
+
fn=update_column_metadata,
|
172 |
inputs=[dataset_file, dataset_column],
|
173 |
outputs=[dataset_corpus],
|
174 |
)
|
|
|
176 |
methodology.change(
|
177 |
fn=get_methodology_metadata,
|
178 |
inputs=[methodology],
|
179 |
+
outputs=[methodology_metadata, evalButton],
|
180 |
)
|
181 |
|
182 |
evalButton.click(
|
183 |
fn=evaluate,
|
184 |
inputs=[
|
185 |
dataset_file,
|
186 |
+
dataset_sampling_method,
|
187 |
+
dataset_sampling_size,
|
188 |
dataset_column,
|
189 |
methodology,
|
190 |
],
|
config/gender_lexicons.json
ADDED
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"male_lexicons": [
|
3 |
+
"man",
|
4 |
+
"boy",
|
5 |
+
"male",
|
6 |
+
"he",
|
7 |
+
"son",
|
8 |
+
"his",
|
9 |
+
"himself",
|
10 |
+
"guy",
|
11 |
+
"father",
|
12 |
+
"john"
|
13 |
+
],
|
14 |
+
"male_pronouns": ["he", "him", "his"],
|
15 |
+
"female_lexicons": [
|
16 |
+
"woman",
|
17 |
+
"girl",
|
18 |
+
"female",
|
19 |
+
"she",
|
20 |
+
"daughter",
|
21 |
+
"her",
|
22 |
+
"herself",
|
23 |
+
"gal",
|
24 |
+
"mother",
|
25 |
+
"mary"
|
26 |
+
],
|
27 |
+
"female_pronouns": ["she", "her", "hers"]
|
28 |
+
}
|
methodologies.json → config/methodologies.json
RENAMED
@@ -1,14 +1,14 @@
|
|
1 |
{
|
2 |
"Gender Divide (Term Identity Diversity)": {
|
3 |
"description": "333",
|
4 |
-
"fx": "
|
5 |
},
|
6 |
"Gender Profession Bias (Lexical Evaluation)": {
|
7 |
"description": "This approach to addressing gender bias in language places a strong emphasis on a fundamental shift in detection and mitigation strategies.\n- Instead of solely relying on traditional frequency-based methods, this approach adopts a more nuanced perspective, prioritizing features within the text that consider contextual and semantic cues. It recognizes that gender bias extends beyond mere word frequency and delves into how language is structured and how it reinforces gender stereotypes.\n- Even with advanced models like Word Embedding and Contextual Word Embedding, which capture more complex language features, there's still a risk of inheriting biases from training data.\n- To tackle this, this approach advocates for a data-driven strategy, involving the collection and labeling of datasets encompassing various subtypes of bias, using a comprehensive taxonomy for precise categorization.",
|
8 |
-
"fx": "
|
9 |
},
|
10 |
"GenBiT (Microsoft Responsible AI Gender Bias Tool)": {
|
11 |
"description": "[GenBiT](https://www.microsoft.com/en-us/research/uploads/prod/2021/10/MSJAR_Genbit_Final_Version-616fd3a073758.pdf) is a versatile tool designed to address gender bias in language datasets by utilizing word co-occurrence statistical methods to measure bias. It introduces a novel approach to mitigating gender bias by combining contextual data augmentation, random sampling, sentence classification, and targeted gendered data filtering.\n- The primary goal is to reduce historical gender biases within conversational parallel multilingual datasets, ultimately enhancing the fairness and inclusiveness of machine learning model training and its subsequent applications.\n- What sets GenBiT apart is its adaptability to various forms of bias, not limited to gender alone. It can effectively address biases related to race, religion, or other dimensions, making it a valuable generic tool for bias mitigation in language datasets.\n- GenBiT's impact extends beyond bias reduction metrics; it has shown positive results in improving the performance of machine learning classifiers like Support Vector Machine(SVM). Augmented datasets produced by GenBiT yield significant enhancements in f1-score when compared to the original datasets, underlining its practical benefits in machine learning applications.",
|
12 |
-
"fx": "
|
13 |
}
|
14 |
}
|
|
|
1 |
{
|
2 |
"Gender Divide (Term Identity Diversity)": {
|
3 |
"description": "333",
|
4 |
+
"fx": "eval_gender_divide"
|
5 |
},
|
6 |
"Gender Profession Bias (Lexical Evaluation)": {
|
7 |
"description": "This approach to addressing gender bias in language places a strong emphasis on a fundamental shift in detection and mitigation strategies.\n- Instead of solely relying on traditional frequency-based methods, this approach adopts a more nuanced perspective, prioritizing features within the text that consider contextual and semantic cues. It recognizes that gender bias extends beyond mere word frequency and delves into how language is structured and how it reinforces gender stereotypes.\n- Even with advanced models like Word Embedding and Contextual Word Embedding, which capture more complex language features, there's still a risk of inheriting biases from training data.\n- To tackle this, this approach advocates for a data-driven strategy, involving the collection and labeling of datasets encompassing various subtypes of bias, using a comprehensive taxonomy for precise categorization.",
|
8 |
+
"fx": "eval_gender_profession"
|
9 |
},
|
10 |
"GenBiT (Microsoft Responsible AI Gender Bias Tool)": {
|
11 |
"description": "[GenBiT](https://www.microsoft.com/en-us/research/uploads/prod/2021/10/MSJAR_Genbit_Final_Version-616fd3a073758.pdf) is a versatile tool designed to address gender bias in language datasets by utilizing word co-occurrence statistical methods to measure bias. It introduces a novel approach to mitigating gender bias by combining contextual data augmentation, random sampling, sentence classification, and targeted gendered data filtering.\n- The primary goal is to reduce historical gender biases within conversational parallel multilingual datasets, ultimately enhancing the fairness and inclusiveness of machine learning model training and its subsequent applications.\n- What sets GenBiT apart is its adaptability to various forms of bias, not limited to gender alone. It can effectively address biases related to race, religion, or other dimensions, making it a valuable generic tool for bias mitigation in language datasets.\n- GenBiT's impact extends beyond bias reduction metrics; it has shown positive results in improving the performance of machine learning classifiers like Support Vector Machine(SVM). Augmented datasets produced by GenBiT yield significant enhancements in f1-score when compared to the original datasets, underlining its practical benefits in machine learning applications.",
|
12 |
+
"fx": "eval_genbit"
|
13 |
}
|
14 |
}
|
config/profession_lexicons.json
ADDED
@@ -0,0 +1,156 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"professions": [
|
3 |
+
"Accountant",
|
4 |
+
"Actor",
|
5 |
+
"Actress",
|
6 |
+
"Aerospace Engineer",
|
7 |
+
"Agricultural Scientist",
|
8 |
+
"Air Traffic Controller",
|
9 |
+
"Aircraft Mechanic",
|
10 |
+
"Animator",
|
11 |
+
"Architect",
|
12 |
+
"Art Director",
|
13 |
+
"Attorney",
|
14 |
+
"Lawyer",
|
15 |
+
"Audiologist",
|
16 |
+
"Author",
|
17 |
+
"Writer",
|
18 |
+
"Baker",
|
19 |
+
"Barber",
|
20 |
+
"Hairdresser",
|
21 |
+
"Bartender",
|
22 |
+
"Biomedical Engineer",
|
23 |
+
"Botanist",
|
24 |
+
"Broadcast Journalist",
|
25 |
+
"Business Analyst",
|
26 |
+
"Carpenter",
|
27 |
+
"Chef",
|
28 |
+
"Cook",
|
29 |
+
"Chemist",
|
30 |
+
"Civil Engineer",
|
31 |
+
"Clinical Psychologist",
|
32 |
+
"Commercial Diver",
|
33 |
+
"Computer Programmer",
|
34 |
+
"Construction Worker",
|
35 |
+
"Corporate Trainer",
|
36 |
+
"Cosmetologist",
|
37 |
+
"Counselor",
|
38 |
+
"Therapist",
|
39 |
+
"Court Reporter",
|
40 |
+
"Creative Director",
|
41 |
+
"Criminologist",
|
42 |
+
"Customer Service Representative",
|
43 |
+
"Data Analyst",
|
44 |
+
"Dental Assistant",
|
45 |
+
"Dentist",
|
46 |
+
"Dermatologist",
|
47 |
+
"Dietician",
|
48 |
+
"Nutritionist",
|
49 |
+
"Doctor",
|
50 |
+
"Physician",
|
51 |
+
"Economist",
|
52 |
+
"Electrician",
|
53 |
+
"Elementary School Teacher",
|
54 |
+
"Emergency Medical Technician",
|
55 |
+
"Engineer",
|
56 |
+
"Environmental Scientist",
|
57 |
+
"Event Planner",
|
58 |
+
"Fashion Designer",
|
59 |
+
"Film Director",
|
60 |
+
"Financial Analyst",
|
61 |
+
"Firefighter",
|
62 |
+
"Fisherman",
|
63 |
+
"Fitness Trainer",
|
64 |
+
"Flight Attendant",
|
65 |
+
"Florist",
|
66 |
+
"Food Scientist",
|
67 |
+
"Forensic Scientist",
|
68 |
+
"Furniture Maker",
|
69 |
+
"Game Developer",
|
70 |
+
"Gardener",
|
71 |
+
"Landscaper",
|
72 |
+
"Geologist",
|
73 |
+
"Graphic Designer",
|
74 |
+
"Hair Stylist",
|
75 |
+
"Historian",
|
76 |
+
"Home Health Aide",
|
77 |
+
"Hotel Manager",
|
78 |
+
"Human Resources Manager",
|
79 |
+
"Immigration Lawyer",
|
80 |
+
"Industrial Designer",
|
81 |
+
"Insurance Agent",
|
82 |
+
"Interior Designer",
|
83 |
+
"Interpreter",
|
84 |
+
"Translator",
|
85 |
+
"Investment Banker",
|
86 |
+
"IT Specialist",
|
87 |
+
"Journalist",
|
88 |
+
"Judge",
|
89 |
+
"Kindergarten Teacher",
|
90 |
+
"Land Surveyor",
|
91 |
+
"Landscape Architect",
|
92 |
+
"Lawyer",
|
93 |
+
"Attorney",
|
94 |
+
"Librarian",
|
95 |
+
"Life Coach",
|
96 |
+
"Linguist",
|
97 |
+
"Makeup Artist",
|
98 |
+
"Management Consultant",
|
99 |
+
"Manufacturing Engineer",
|
100 |
+
"Marine Biologist",
|
101 |
+
"Marketing Manager",
|
102 |
+
"Massage Therapist",
|
103 |
+
"Mechanical Engineer",
|
104 |
+
"Medical Assistant",
|
105 |
+
"Medical Researcher",
|
106 |
+
"Meteorologist",
|
107 |
+
"Midwife",
|
108 |
+
"Military Officer",
|
109 |
+
"Music Producer",
|
110 |
+
"Musician",
|
111 |
+
"Nurse",
|
112 |
+
"Occupational Therapist",
|
113 |
+
"Optician",
|
114 |
+
"Optometrist",
|
115 |
+
"Paralegal",
|
116 |
+
"Paramedic",
|
117 |
+
"Patent Attorney",
|
118 |
+
"Pediatrician",
|
119 |
+
"Personal Trainer",
|
120 |
+
"Petroleum Engineer",
|
121 |
+
"Pharmacist",
|
122 |
+
"Photographer",
|
123 |
+
"Physical Therapist",
|
124 |
+
"Physician Assistant",
|
125 |
+
"Pilot",
|
126 |
+
"Plumber",
|
127 |
+
"Police Officer",
|
128 |
+
"Political Scientist",
|
129 |
+
"Preschool Teacher",
|
130 |
+
"Private Investigator",
|
131 |
+
"Product Manager",
|
132 |
+
"Professor",
|
133 |
+
"Lecturer",
|
134 |
+
"Programmer",
|
135 |
+
"Psychiatrist",
|
136 |
+
"Psychologist",
|
137 |
+
"Public Relations Specialist",
|
138 |
+
"Public School Teacher",
|
139 |
+
"Real Estate Agent",
|
140 |
+
"Broker",
|
141 |
+
"Receptionist",
|
142 |
+
"Registered Nurse",
|
143 |
+
"Reporter",
|
144 |
+
"Restaurant Manager",
|
145 |
+
"Sales Representative",
|
146 |
+
"School Counselor",
|
147 |
+
"Scientist",
|
148 |
+
"Screenwriter",
|
149 |
+
"Social Media Manager",
|
150 |
+
"Social Worker",
|
151 |
+
"Software Developer",
|
152 |
+
"Speech-Language Pathologist",
|
153 |
+
"Sports Coach",
|
154 |
+
"Statistician"
|
155 |
+
]
|
156 |
+
}
|
data/z_house.csv
DELETED
@@ -1,7 +0,0 @@
|
|
1 |
-
PropertyID,StreetAddress,City,State,ZipCode,NumberOfBedrooms,NumberOfBathrooms,SquareFootage,Price
|
2 |
-
1,123 Main St,Los Angeles,CA,90001,3,2,1800,550000
|
3 |
-
2,456 Elm St,New York,NY,10001,2,1,1200,750000
|
4 |
-
3,789 Oak St,San Francisco,CA,94101,4,3,2500,950000
|
5 |
-
4,101 Maple St,Boston,MA,02101,3,2.5,2000,680000
|
6 |
-
5,202 Pine St,Miami,FL,33101,4,3.5,2700,820000
|
7 |
-
6,303 Cedar St,Chicago,IL,60601,2,1,1100,450000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
data/z_sentences.csv
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
ID,Sentences,Gender Bias Test
|
2 |
+
1,The doctor examined the patient.,Neutral
|
3 |
+
2,The teacher helped the student with their homework.,Neutral
|
4 |
+
3,The engineer designed the new software.,Neutral
|
5 |
+
4,The nurse cared for the sick child.,Neutral
|
6 |
+
5,The chef prepared a delicious meal.,Neutral
|
7 |
+
6,The firefighter bravely rescued people from the burning building.,Neutral
|
8 |
+
7,The police officer maintained law and order in the city.,Neutral
|
9 |
+
8,The scientist conducted groundbreaking research.,Neutral
|
10 |
+
9,The artist created a beautiful masterpiece.,Neutral
|
11 |
+
10,The CEO made important decisions for the company.,Neutral
|
requirements.txt
CHANGED
@@ -1,4 +1,4 @@
|
|
1 |
-
gradio==3.
|
2 |
gradio_client==0.5.0
|
3 |
numpy==1.25.2
|
4 |
pandas==2.0.3
|
|
|
1 |
+
gradio==3.43.2
|
2 |
gradio_client==0.5.0
|
3 |
numpy==1.25.2
|
4 |
pandas==2.0.3
|
scripts/genbit.py
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from genbit.genbit_metrics import GenBitMetrics
|
2 |
+
|
3 |
+
|
4 |
+
def eval_genbit(data):
|
5 |
+
genbit_metrics = GenBitMetrics(
|
6 |
+
language_code="en", context_window=5, distance_weight=0.95, percentile_cutoff=80
|
7 |
+
)
|
8 |
+
|
9 |
+
data[data.columns[0]] = data[data.columns[0]].to_list()
|
10 |
+
|
11 |
+
genbit_metrics.add_data(data, tokenized=False)
|
12 |
+
genbit_metrics = genbit_metrics.get_metrics(output_word_list=False)
|
13 |
+
|
14 |
+
return genbit_metrics
|
scripts/genbit_metrics.py
DELETED
@@ -1,48 +0,0 @@
|
|
1 |
-
from genbit.genbit_metrics import GenBitMetrics
|
2 |
-
import pandas as pd
|
3 |
-
from utils.read_config import get_args
|
4 |
-
from utils.load_csv import load_sample
|
5 |
-
|
6 |
-
|
7 |
-
def cal_metrics(dataset):
|
8 |
-
# Create a GenBit object with the desired settings:
|
9 |
-
|
10 |
-
genbit_metrics_object = GenBitMetrics(language_code="en", context_window=5, distance_weight=0.95, percentile_cutoff=80)
|
11 |
-
|
12 |
-
# Let's say you want to use GenBit with a test sentence, you can add the sentence to GenBit:
|
13 |
-
#dataset = ["I think she does not like cats. I think he does not like cats.", "He is a dog person."]
|
14 |
-
|
15 |
-
genbit_metrics_object.add_data(dataset, tokenized=False)
|
16 |
-
|
17 |
-
|
18 |
-
# To generate the gender bias metrics, we run `get_metrics` by setting `output_statistics` and `output_word_lists` to false, we can reduce the number of metrics created.
|
19 |
-
metrics = genbit_metrics_object.get_metrics(output_statistics=True, output_word_list=True)
|
20 |
-
|
21 |
-
return metrics
|
22 |
-
|
23 |
-
|
24 |
-
# Function to extract genbit metrics
|
25 |
-
def extract_genbit_metris(stats):
|
26 |
-
metrics = {}
|
27 |
-
metrics["genbit_score"] = str(stats["genbit_score"])
|
28 |
-
metrics["percentage_of_female_gender_definition_words"] = str(stats["percentage_of_female_gender_definition_words"])
|
29 |
-
metrics["percentage_of_male_gender_definition_words"] = str(stats["percentage_of_male_gender_definition_words"])
|
30 |
-
metrics["percentage_of_non_binary_gender_definition_words"] = str(stats["percentage_of_non_binary_gender_definition_words"])
|
31 |
-
metrics["percentage_of_trans_gender_definition_words"] = str(stats["percentage_of_trans_gender_definition_words"])
|
32 |
-
metrics["percentage_of_cis_gender_definition_words"] = str(stats["percentage_of_cis_gender_definition_words"])
|
33 |
-
metrics["num_words_considered"] = str(stats["statistics"]["num_words_considered"])
|
34 |
-
|
35 |
-
return metrics
|
36 |
-
|
37 |
-
def load_dataset_and_get_genbit_metrics(df, sample_method, col_name, num_sample_records):
|
38 |
-
|
39 |
-
|
40 |
-
sample_df = load_sample(num_sample_records, sample_method, df, col_name)
|
41 |
-
|
42 |
-
# Turn into a list of text.
|
43 |
-
sample_text = sample_df[col_name].tolist()
|
44 |
-
|
45 |
-
# Call cal_metrics function
|
46 |
-
stats = cal_metrics(sample_text)
|
47 |
-
metrics = extract_genbit_metris(stats)
|
48 |
-
return metrics
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
scripts/{gender_tagging.py → gender_divide.py}
RENAMED
@@ -1,26 +1,23 @@
|
|
1 |
-
# Import required libraries
|
2 |
-
import pandas as pd
|
3 |
import re
|
4 |
-
|
5 |
-
|
|
|
|
|
6 |
|
7 |
-
# Function to get count of male terms in text
|
8 |
def count_male_terms(text, male_terms):
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
|
14 |
-
# Function to get count of female terms in text
|
15 |
def count_female_terms(text, female_terms):
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
|
21 |
-
# Function to get gender tag categories
|
22 |
def get_gender_tag(count_m_term, count_f_term):
|
23 |
-
tag =
|
24 |
if count_m_term == 0 and count_f_term == 0:
|
25 |
tag = "No Gender"
|
26 |
|
@@ -44,50 +41,60 @@ def get_gender_tag(count_m_term, count_f_term):
|
|
44 |
return tag
|
45 |
|
46 |
|
47 |
-
# Function to calculate PG and SPG
|
48 |
def get_pg_spg(sample_df):
|
49 |
-
count_no_gender_sentences = sample_df[sample_df["gender_cat"] == "No Gender"][
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
56 |
|
57 |
-
count_female_pg = sample_df[sample_df['gender_cat'] == "Female Positive Gender"]['gender_cat'].count()
|
58 |
-
count_female_spg = sample_df[sample_df['gender_cat'] == "Female Stronly Positive Gender"]['gender_cat'].count()
|
59 |
-
|
60 |
return {
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
}
|
69 |
|
70 |
-
# Function to load dataset and get the analysis done
|
71 |
-
def load_dataset_and_analyze_gender_tag(df, sample_method, col_name, num_sample_records):
|
72 |
-
# Read config file
|
73 |
-
male_terms = get_args("male_terms")
|
74 |
-
female_terms = get_args("female_terms")
|
75 |
-
# Load sample
|
76 |
-
sample_df = load_sample(num_sample_records, sample_method, df, col_name)
|
77 |
-
|
78 |
-
# Lowercase of text
|
79 |
-
sample_df[col_name] = sample_df[col_name].str.lower().str.strip()
|
80 |
-
|
81 |
-
# Get new columns of count - male terms and female terms
|
82 |
-
sample_df['count_male_term'] = sample_df.apply(lambda x : count_male_terms(x[col_name], male_terms), axis=1)
|
83 |
-
sample_df['count_female_term'] = sample_df.apply(lambda x : count_female_terms(x[:], female_terms), axis=1)
|
84 |
-
|
85 |
-
# Get tag categories
|
86 |
-
sample_df['gender_cat'] = sample_df.apply(lambda row: get_gender_tag(row['count_male_term'], row['count_female_term']), axis=1)
|
87 |
-
|
88 |
-
# Get statistics
|
89 |
-
collection = get_pg_spg(sample_df)
|
90 |
-
return collection
|
91 |
|
92 |
-
|
93 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import re
|
2 |
+
import json
|
3 |
+
|
4 |
+
gender_lexicons = json.load(open("config/gender_lexicons.json", "r"))
|
5 |
+
|
6 |
|
|
|
7 |
def count_male_terms(text, male_terms):
|
8 |
+
pattern = r"\b({})\b".format("|".join(male_terms))
|
9 |
+
match = re.findall(pattern, str(text))
|
10 |
+
return len(match)
|
11 |
+
|
12 |
|
|
|
13 |
def count_female_terms(text, female_terms):
|
14 |
+
pattern = r"\b({})\b".format("|".join(female_terms))
|
15 |
+
match = re.findall(pattern, str(text))
|
16 |
+
return len(match)
|
17 |
+
|
18 |
|
|
|
19 |
def get_gender_tag(count_m_term, count_f_term):
|
20 |
+
tag = ""
|
21 |
if count_m_term == 0 and count_f_term == 0:
|
22 |
tag = "No Gender"
|
23 |
|
|
|
41 |
return tag
|
42 |
|
43 |
|
|
|
44 |
def get_pg_spg(sample_df):
|
45 |
+
count_no_gender_sentences = sample_df[sample_df["gender_cat"] == "No Gender"][
|
46 |
+
"gender_cat"
|
47 |
+
].count()
|
48 |
+
|
49 |
+
count_gender_sentences = sample_df[sample_df["gender_cat"] != "No Gender"][
|
50 |
+
"gender_cat"
|
51 |
+
].count()
|
52 |
+
count_equal_gender = sample_df[sample_df["gender_cat"] == "Equal Gender"][
|
53 |
+
"gender_cat"
|
54 |
+
].count()
|
55 |
+
|
56 |
+
count_male_pg = sample_df[sample_df["gender_cat"] == "Male Positive Gender"][
|
57 |
+
"gender_cat"
|
58 |
+
].count()
|
59 |
+
count_male_spg = sample_df[
|
60 |
+
sample_df["gender_cat"] == "Male Strongly Positive Gender"
|
61 |
+
]["gender_cat"].count()
|
62 |
+
|
63 |
+
count_female_pg = sample_df[sample_df["gender_cat"] == "Female Positive Gender"][
|
64 |
+
"gender_cat"
|
65 |
+
].count()
|
66 |
+
count_female_spg = sample_df[
|
67 |
+
sample_df["gender_cat"] == "Female Stronly Positive Gender"
|
68 |
+
]["gender_cat"].count()
|
69 |
|
|
|
|
|
|
|
70 |
return {
|
71 |
+
"gender": str(count_gender_sentences),
|
72 |
+
"no gender": str(count_no_gender_sentences),
|
73 |
+
"equal gender": str(count_equal_gender),
|
74 |
+
"female pg": str(count_female_pg),
|
75 |
+
"male pg": str(count_male_pg),
|
76 |
+
"female spg": str(count_female_spg),
|
77 |
+
"male spg": str(count_male_spg),
|
78 |
}
|
79 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
80 |
|
81 |
+
def eval_gender_divide(data):
|
82 |
+
male_terms = gender_lexicons.get("male_lexicons")
|
83 |
+
female_terms = gender_lexicons.get("female_lexicons")
|
84 |
+
|
85 |
+
data[data.columns[0]] = data[data.columns[0]].str.lower().str.strip()
|
86 |
+
|
87 |
+
data["count_male_term"] = data.apply(
|
88 |
+
lambda x: count_male_terms(x[data.columns[0]], male_terms), axis=1
|
89 |
+
)
|
90 |
+
data["count_female_term"] = data.apply(
|
91 |
+
lambda x: count_female_terms(x[:], female_terms), axis=1
|
92 |
+
)
|
93 |
+
|
94 |
+
data["gender_cat"] = data.apply(
|
95 |
+
lambda row: get_gender_tag(row["count_male_term"], row["count_female_term"]),
|
96 |
+
axis=1,
|
97 |
+
)
|
98 |
+
|
99 |
+
collection = get_pg_spg(data)
|
100 |
+
return collection
|
scripts/{gender_profession_tagging.py → gender_profession_bias.py}
RENAMED
@@ -1,43 +1,43 @@
|
|
1 |
-
import pandas as pd
|
2 |
import re
|
3 |
-
import
|
4 |
-
from spacy.lang.en import English
|
5 |
-
import time
|
6 |
-
from tqdm import tqdm
|
7 |
-
import multiprocessing.pool
|
8 |
|
9 |
-
import
|
10 |
-
|
11 |
-
from
|
12 |
-
from utils.load_csv import load_sample
|
13 |
|
|
|
|
|
14 |
|
15 |
-
# For sentence split
|
16 |
nlp = English()
|
17 |
nlp.add_pipe("sentencizer")
|
18 |
|
19 |
-
# Function to split sentences
|
20 |
-
def get_split_text(text):
|
21 |
|
|
|
22 |
doc = nlp(text)
|
23 |
sentences = [sent for sent in doc.sents]
|
24 |
return sentences
|
25 |
|
26 |
-
def get_gender_prof_match_details(df_text):
|
27 |
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
|
|
32 |
|
33 |
-
# Get regex pattern
|
34 |
-
male_pronoun_pat, female_pronoun_pat, professions_pat = get_regex_pattern(male_pronoun, female_pronoun, professions)
|
35 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
36 |
|
37 |
split_text = get_split_text(df_text)
|
38 |
|
39 |
results = []
|
40 |
-
|
41 |
for text in split_text:
|
42 |
male_pronoun_match = re.findall(male_pronoun_pat, str(text))
|
43 |
female_pronoun_match = re.findall(female_pronoun_pat, str(text))
|
@@ -52,78 +52,60 @@ def get_gender_prof_match_details(df_text):
|
|
52 |
if len(female_pronoun_match) != 0 and len(prof_match) != 0:
|
53 |
both_match = "Yes"
|
54 |
|
55 |
-
# Unpack from list
|
56 |
male_pronoun_match = ",".join(male_pronoun_match)
|
57 |
female_pronoun_match = ",".join(female_pronoun_match)
|
58 |
|
59 |
prof_match = ",".join(prof_match)
|
60 |
|
61 |
-
results.append(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
62 |
|
63 |
return results
|
64 |
|
65 |
-
|
66 |
def call_multiprocessing_pool(df_text):
|
67 |
concurrent = 2000
|
68 |
pool = multiprocessing.pool.ThreadPool(processes=concurrent)
|
69 |
result_list = pool.map(get_gender_prof_match_details, df_text, chunksize=1)
|
70 |
pool.close()
|
71 |
|
72 |
-
# return_list is nested -- we need to flatten it
|
73 |
flat_return_list = [item for sublist in result_list for item in sublist]
|
74 |
|
75 |
-
|
76 |
-
cols = ["Split_Text", 'Male Pronoun', 'Female Pronoun', 'Profession', "Both Match"]
|
77 |
return_df = pd.DataFrame(flat_return_list, columns=cols)
|
78 |
|
79 |
return return_df
|
80 |
|
81 |
-
# Function to get statistics
|
82 |
-
def get_statistics(results_df):
|
83 |
-
count_total_sentence = results_df.shape[0]
|
84 |
-
count_both_match = results_df[results_df["Both Match"] == "Yes"]['Both Match'].count()
|
85 |
-
count_male_pronoun = results_df[results_df["Male Pronoun"] != ""]["Male Pronoun"].count()
|
86 |
-
count_female_pronoun = results_df[results_df["Female Pronoun"] != ""]["Female Pronoun"].count()
|
87 |
-
|
88 |
-
count_male_pronoun_profession = results_df[(results_df["Male Pronoun"] != "") & (results_df["Profession"] != "")]["Male Pronoun"].count()
|
89 |
-
count_female_pronoun_profession = results_df[(results_df["Female Pronoun"] != "") & (results_df["Profession"] != "")]["Female Pronoun"].count()
|
90 |
-
|
91 |
-
return{
|
92 |
-
"total_sentence" : str(count_total_sentence),
|
93 |
-
"both_gender_prof_match" : str(count_both_match),
|
94 |
-
"count_male_pronoun" : str(count_male_pronoun),
|
95 |
-
"count_female_pronoun" : str(count_female_pronoun),
|
96 |
-
"count_male_pronoun_profession" : str(count_male_pronoun_profession),
|
97 |
-
"count_female_pronoun_profession" : str(count_female_pronoun_profession)
|
98 |
-
}
|
99 |
-
|
100 |
-
# Function to return regular expression patterns
|
101 |
-
def get_regex_pattern(male_pronoun, female_pronoun, professions):
|
102 |
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
|
|
112 |
|
|
|
113 |
|
114 |
-
|
115 |
-
# Get args from config file
|
116 |
|
117 |
-
|
118 |
|
119 |
|
120 |
-
|
121 |
-
|
122 |
|
123 |
-
|
124 |
-
|
125 |
|
126 |
-
stats
|
127 |
-
|
128 |
-
# Get statistics
|
129 |
-
return stats
|
|
|
|
|
1 |
import re
|
2 |
+
import json
|
|
|
|
|
|
|
|
|
3 |
|
4 |
+
import pandas as pd
|
5 |
+
import multiprocessing.pool
|
6 |
+
from spacy.lang.en import English
|
|
|
7 |
|
8 |
+
gender_lexicons = json.load(open("config/gender_lexicons.json", "r"))
|
9 |
+
profession_lexicons = json.load(open("config/profession_lexicons.json", "r"))
|
10 |
|
|
|
11 |
nlp = English()
|
12 |
nlp.add_pipe("sentencizer")
|
13 |
|
|
|
|
|
14 |
|
15 |
+
def get_split_text(text):
|
16 |
doc = nlp(text)
|
17 |
sentences = [sent for sent in doc.sents]
|
18 |
return sentences
|
19 |
|
|
|
20 |
|
21 |
+
def compile_regex_patterns(patterns):
|
22 |
+
return [
|
23 |
+
re.compile(r"\b({})\b".format("|".join(pattern)), flags=re.IGNORECASE)
|
24 |
+
for pattern in patterns
|
25 |
+
]
|
26 |
|
|
|
|
|
27 |
|
28 |
+
def get_gender_prof_match_details(df_text):
|
29 |
+
male_pronouns = gender_lexicons.get("male_pronouns")
|
30 |
+
female_pronouns = gender_lexicons.get("female_pronouns")
|
31 |
+
professions = profession_lexicons.get("professions")
|
32 |
+
|
33 |
+
male_pronoun_pat, female_pronoun_pat, professions_pat = compile_regex_patterns(
|
34 |
+
[male_pronouns, female_pronouns, professions]
|
35 |
+
)
|
36 |
|
37 |
split_text = get_split_text(df_text)
|
38 |
|
39 |
results = []
|
40 |
+
|
41 |
for text in split_text:
|
42 |
male_pronoun_match = re.findall(male_pronoun_pat, str(text))
|
43 |
female_pronoun_match = re.findall(female_pronoun_pat, str(text))
|
|
|
52 |
if len(female_pronoun_match) != 0 and len(prof_match) != 0:
|
53 |
both_match = "Yes"
|
54 |
|
|
|
55 |
male_pronoun_match = ",".join(male_pronoun_match)
|
56 |
female_pronoun_match = ",".join(female_pronoun_match)
|
57 |
|
58 |
prof_match = ",".join(prof_match)
|
59 |
|
60 |
+
results.append(
|
61 |
+
(
|
62 |
+
str(text),
|
63 |
+
male_pronoun_match,
|
64 |
+
female_pronoun_match,
|
65 |
+
prof_match,
|
66 |
+
both_match,
|
67 |
+
)
|
68 |
+
)
|
69 |
|
70 |
return results
|
71 |
|
72 |
+
|
73 |
def call_multiprocessing_pool(df_text):
|
74 |
concurrent = 2000
|
75 |
pool = multiprocessing.pool.ThreadPool(processes=concurrent)
|
76 |
result_list = pool.map(get_gender_prof_match_details, df_text, chunksize=1)
|
77 |
pool.close()
|
78 |
|
|
|
79 |
flat_return_list = [item for sublist in result_list for item in sublist]
|
80 |
|
81 |
+
cols = ["Split Text", "Male Pronoun", "Female Pronoun", "Profession", "Both Match"]
|
|
|
82 |
return_df = pd.DataFrame(flat_return_list, columns=cols)
|
83 |
|
84 |
return return_df
|
85 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
86 |
|
87 |
+
def get_statistics(result):
|
88 |
+
conditions = {
|
89 |
+
"both_gender_prof_match": result["Both Match"].eq("Yes"),
|
90 |
+
"count_male_pronoun": result["Male Pronoun"].ne(""),
|
91 |
+
"count_female_pronoun": result["Female Pronoun"].ne(""),
|
92 |
+
"count_male_pronoun_profession": result["Male Pronoun"].ne("")
|
93 |
+
& result["Profession"].ne(""),
|
94 |
+
"count_female_pronoun_profession": result["Female Pronoun"].ne("")
|
95 |
+
& result["Profession"].ne(""),
|
96 |
+
}
|
97 |
|
98 |
+
stats = {key: str(value.sum()) for key, value in conditions.items()}
|
99 |
|
100 |
+
stats["total_sentence"] = str(len(result))
|
|
|
101 |
|
102 |
+
return stats
|
103 |
|
104 |
|
105 |
+
def eval_gender_profession(data):
|
106 |
+
data = data[data.columns[0]].str.lower().str.strip()
|
107 |
|
108 |
+
result = call_multiprocessing_pool(data)
|
109 |
+
stats = get_statistics(result)
|
110 |
|
111 |
+
return stats
|
|
|
|
|
|
utils/config.json
DELETED
@@ -1,160 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"first_records" : 2000,
|
3 |
-
"random_seed" : 42,
|
4 |
-
"male_terms" : ["man", "boy", "male", "he", "son", "his", "himself", "guy", "father", "john"],
|
5 |
-
"female_terms" : ["woman", "girl", "female", "she", "daughter", "her", "herself", "gal", "mother", "mary"],
|
6 |
-
"male_pronoun" : ["he", "him", "his"],
|
7 |
-
"female_pronoun" : ["she", "her", "hers"],
|
8 |
-
"professions" : ["Accountant",
|
9 |
-
"Actor",
|
10 |
-
"Actress",
|
11 |
-
"Aerospace Engineer",
|
12 |
-
"Agricultural Scientist",
|
13 |
-
"Air Traffic Controller",
|
14 |
-
"Aircraft Mechanic",
|
15 |
-
"Animator",
|
16 |
-
"Architect",
|
17 |
-
"Art Director",
|
18 |
-
"Attorney",
|
19 |
-
"Lawyer",
|
20 |
-
"Audiologist",
|
21 |
-
"Author",
|
22 |
-
"Writer",
|
23 |
-
"Baker",
|
24 |
-
"Barber",
|
25 |
-
"Hairdresser",
|
26 |
-
"Bartender",
|
27 |
-
"Biomedical Engineer",
|
28 |
-
"Botanist",
|
29 |
-
"Broadcast Journalist",
|
30 |
-
"Business Analyst",
|
31 |
-
"Carpenter",
|
32 |
-
"Chef",
|
33 |
-
"Cook",
|
34 |
-
"Chemist",
|
35 |
-
"Civil Engineer",
|
36 |
-
"Clinical Psychologist",
|
37 |
-
"Commercial Diver",
|
38 |
-
"Computer Programmer",
|
39 |
-
"Construction Worker",
|
40 |
-
"Corporate Trainer",
|
41 |
-
"Cosmetologist",
|
42 |
-
"Counselor",
|
43 |
-
"Therapist",
|
44 |
-
"Court Reporter",
|
45 |
-
"Creative Director",
|
46 |
-
"Criminologist",
|
47 |
-
"Customer Service Representative",
|
48 |
-
"Data Analyst",
|
49 |
-
"Dental Assistant",
|
50 |
-
"Dentist",
|
51 |
-
"Dermatologist",
|
52 |
-
"Dietician",
|
53 |
-
"Nutritionist",
|
54 |
-
"Doctor",
|
55 |
-
"Physician",
|
56 |
-
"Economist",
|
57 |
-
"Electrician",
|
58 |
-
"Elementary School Teacher",
|
59 |
-
"Emergency Medical Technician",
|
60 |
-
"Engineer",
|
61 |
-
"Environmental Scientist",
|
62 |
-
"Event Planner",
|
63 |
-
"Fashion Designer",
|
64 |
-
"Film Director",
|
65 |
-
"Financial Analyst",
|
66 |
-
"Firefighter",
|
67 |
-
"Fisherman",
|
68 |
-
"Fitness Trainer",
|
69 |
-
"Flight Attendant",
|
70 |
-
"Florist",
|
71 |
-
"Food Scientist",
|
72 |
-
"Forensic Scientist",
|
73 |
-
"Furniture Maker",
|
74 |
-
"Game Developer",
|
75 |
-
"Gardener",
|
76 |
-
"Landscaper",
|
77 |
-
"Geologist",
|
78 |
-
"Graphic Designer",
|
79 |
-
"Hair Stylist",
|
80 |
-
"Historian",
|
81 |
-
"Home Health Aide",
|
82 |
-
"Hotel Manager",
|
83 |
-
"Human Resources Manager",
|
84 |
-
"Immigration Lawyer",
|
85 |
-
"Industrial Designer",
|
86 |
-
"Insurance Agent",
|
87 |
-
"Interior Designer",
|
88 |
-
"Interpreter",
|
89 |
-
"Translator",
|
90 |
-
"Investment Banker",
|
91 |
-
"IT Specialist",
|
92 |
-
"Journalist",
|
93 |
-
"Judge",
|
94 |
-
"Kindergarten Teacher",
|
95 |
-
"Land Surveyor",
|
96 |
-
"Landscape Architect",
|
97 |
-
"Lawyer",
|
98 |
-
"Attorney",
|
99 |
-
"Librarian",
|
100 |
-
"Life Coach",
|
101 |
-
"Linguist",
|
102 |
-
"Makeup Artist",
|
103 |
-
"Management Consultant",
|
104 |
-
"Manufacturing Engineer",
|
105 |
-
"Marine Biologist",
|
106 |
-
"Marketing Manager",
|
107 |
-
"Massage Therapist",
|
108 |
-
"Mechanical Engineer",
|
109 |
-
"Medical Assistant",
|
110 |
-
"Medical Researcher",
|
111 |
-
"Meteorologist",
|
112 |
-
"Midwife",
|
113 |
-
"Military Officer",
|
114 |
-
"Music Producer",
|
115 |
-
"Musician",
|
116 |
-
"Nurse",
|
117 |
-
"Occupational Therapist",
|
118 |
-
"Optician",
|
119 |
-
"Optometrist",
|
120 |
-
"Paralegal",
|
121 |
-
"Paramedic",
|
122 |
-
"Patent Attorney",
|
123 |
-
"Pediatrician",
|
124 |
-
"Personal Trainer",
|
125 |
-
"Petroleum Engineer",
|
126 |
-
"Pharmacist",
|
127 |
-
"Photographer",
|
128 |
-
"Physical Therapist",
|
129 |
-
"Physician Assistant",
|
130 |
-
"Pilot",
|
131 |
-
"Plumber",
|
132 |
-
"Police Officer",
|
133 |
-
"Political Scientist",
|
134 |
-
"Preschool Teacher",
|
135 |
-
"Private Investigator",
|
136 |
-
"Product Manager",
|
137 |
-
"Professor",
|
138 |
-
"Lecturer",
|
139 |
-
"Programmer",
|
140 |
-
"Psychiatrist",
|
141 |
-
"Psychologist",
|
142 |
-
"Public Relations Specialist",
|
143 |
-
"Public School Teacher",
|
144 |
-
"Real Estate Agent",
|
145 |
-
"Broker",
|
146 |
-
"Receptionist",
|
147 |
-
"Registered Nurse",
|
148 |
-
"Reporter",
|
149 |
-
"Restaurant Manager",
|
150 |
-
"Sales Representative",
|
151 |
-
"School Counselor",
|
152 |
-
"Scientist",
|
153 |
-
"Screenwriter",
|
154 |
-
"Social Media Manager",
|
155 |
-
"Social Worker",
|
156 |
-
"Software Developer",
|
157 |
-
"Speech-Language Pathologist",
|
158 |
-
"Sports Coach",
|
159 |
-
"Statistician"]
|
160 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
utils/load_csv.py
DELETED
@@ -1,23 +0,0 @@
|
|
1 |
-
import pandas as pd
|
2 |
-
from utils.read_config import get_args
|
3 |
-
|
4 |
-
# Function to load sample of dataset
|
5 |
-
|
6 |
-
|
7 |
-
def load_sample(num_sample_records, sample_method, df, col_name):
|
8 |
-
|
9 |
-
sample_first_records = get_args("first_records")
|
10 |
-
sample_random_seed = get_args("random_seed")
|
11 |
-
|
12 |
-
num_sample_records = num_sample_records if num_sample_records <= sample_first_records else sample_first_records
|
13 |
-
|
14 |
-
# Keep only required column
|
15 |
-
df = df[[col_name]]
|
16 |
-
if sample_method == "First":
|
17 |
-
df = df.iloc[:num_sample_records].copy().reset_index()
|
18 |
-
if sample_method == "Last":
|
19 |
-
df = df.iloc[-num_sample_records:].copy().reset_index()
|
20 |
-
if sample_method == "Random":
|
21 |
-
df = df.sample(num_sample_records,
|
22 |
-
random_state=sample_random_seed).copy().reset_index()
|
23 |
-
return df
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
utils/read_config.py
DELETED
@@ -1,13 +0,0 @@
|
|
1 |
-
import json
|
2 |
-
|
3 |
-
def read_config_file():
|
4 |
-
with open("utils/config.json", "r") as jsonfile:
|
5 |
-
data = json.load(jsonfile)
|
6 |
-
return data
|
7 |
-
|
8 |
-
def get_args(args):
|
9 |
-
try:
|
10 |
-
data = read_config_file()
|
11 |
-
except:
|
12 |
-
raise "Could not read config file."
|
13 |
-
return data[args]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|