sudipta002 committed on
Commit
b20457e
1 Parent(s): 239a9e5

Add backend scripts

Browse files
.gitignore CHANGED
@@ -1,5 +1,7 @@
1
  # Byte-compiled / optimized / DLL files
2
  __pycache__/
 
 
3
  *.py[cod]
4
  *$py.class
5
 
 
1
  # Byte-compiled / optimized / DLL files
2
  __pycache__/
3
+ testing/
4
+ check_gender_tagging.py
5
  *.py[cod]
6
  *$py.class
7
 
requirements.txt CHANGED
@@ -1,4 +1,5 @@
1
- gradio==3.41.2
2
  gradio_client==0.5.0
3
  numpy==1.25.2
4
- pandas==2.0.3
 
 
1
+ gradio==3.40.1
2
  gradio_client==0.5.0
3
  numpy==1.25.2
4
+ pandas==2.0.3
5
+ spacy
scripts/.keep DELETED
File without changes
scripts/gender_profession_tagging.py ADDED
@@ -0,0 +1,140 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import re
3
+ import spacy
4
+ from spacy.lang.en import English
5
+ import time
6
+ from tqdm import tqdm
7
+ import multiprocessing.pool
8
+
9
+ import warnings
10
+ warnings.filterwarnings("ignore")
11
+ from utils.read_config import get_args
12
+
13
+
14
+ # For sentence split
15
+ nlp = English()
16
+ nlp.add_pipe("sentencizer")
17
+
18
+ # Function to split sentences
19
def get_split_text(text):
    """Split ``text`` into sentences using the module-level ``nlp`` pipeline.

    Args:
        text: The text to segment.

    Returns:
        A list holding every item yielded by ``nlp(text).sents``, in order.
    """
    return list(nlp(text).sents)
24
+
25
def get_gender_prof_match_details(df_text):
    """Tag every sentence of ``df_text`` with pronoun and profession matches.

    The male/female pronoun lists and the profession list are read from the
    config through ``get_args`` and converted into regex patterns by
    ``get_regex_pattern``.

    Args:
        df_text: Text to analyse; it is split into sentences by
            ``get_split_text``.

    Returns:
        A list with one tuple per sentence:
        ``(sentence, male_pronouns, female_pronouns, professions, both_match)``.
        The three middle items are the comma-joined regex matches (``""`` when
        there are none). ``both_match`` is ``"Yes"`` when the sentence contains
        a profession together with at least one male or female pronoun,
        otherwise ``"No"``.
    """
    male_pronoun_pat, female_pronoun_pat, professions_pat = get_regex_pattern(
        get_args("male_pronoun"),
        get_args("female_pronoun"),
        get_args("professions"),
    )

    # Compile once per call so the loop does not re-resolve each pattern for
    # every sentence.
    male_re = re.compile(male_pronoun_pat)
    female_re = re.compile(female_pronoun_pat)
    prof_re = re.compile(professions_pat)

    results = []
    for sentence in get_split_text(df_text):
        sentence_text = str(sentence)

        male_matches = male_re.findall(sentence_text)
        female_matches = female_re.findall(sentence_text)
        prof_matches = prof_re.findall(sentence_text)

        # A sentence counts as a joint match when a profession co-occurs with
        # a pronoun of either gender.
        has_pronoun = bool(male_matches or female_matches)
        both_match = "Yes" if prof_matches and has_pronoun else "No"

        results.append(
            (
                sentence_text,
                ",".join(male_matches),
                ",".join(female_matches),
                ",".join(prof_matches),
                both_match,
            )
        )

    return results
63
+
64
+ # Function to call multiprocessing threadpool
65
def call_multiprocessing_pool(df_text):
    """Run ``get_gender_prof_match_details`` over every text in a thread pool.

    Despite the name, the work is distributed over threads
    (``multiprocessing.pool.ThreadPool``), not processes.

    Args:
        df_text: Iterable of texts (for example a pandas Series); each item is
            passed to ``get_gender_prof_match_details``.

    Returns:
        A DataFrame with one row per sentence and the columns
        ``Split_Text``, ``Male Pronoun``, ``Female Pronoun``, ``Profession``
        and ``Both Match``.
    """
    texts = list(df_text)

    # Upper bound on worker threads; never start more threads than there are
    # texts, and always at least one so an empty input is still accepted.
    concurrent = 2000
    workers = max(1, min(concurrent, len(texts)))

    # The context manager tears the pool down once all results are collected,
    # so worker threads are not left behind after each call.
    with multiprocessing.pool.ThreadPool(processes=workers) as pool:
        result_list = pool.map(get_gender_prof_match_details, texts, chunksize=1)

    # Each text yields a list of per-sentence tuples; flatten to one list.
    flat_return_list = [item for sublist in result_list for item in sublist]

    cols = ["Split_Text", 'Male Pronoun', 'Female Pronoun', 'Profession', "Both Match"]
    return pd.DataFrame(flat_return_list, columns=cols)
79
+
80
+ # Function to get statistics
81
def get_statistics(results_df):
    """Summarise the per-sentence tagging results.

    Args:
        results_df: DataFrame with the columns ``Male Pronoun``,
            ``Female Pronoun``, ``Profession`` and ``Both Match``.

    Returns:
        A dict with the total number of rows and the number of rows that
        have: a "Yes" in ``Both Match``, a non-empty male pronoun, a
        non-empty female pronoun, and a non-empty pronoun of each gender
        together with a non-empty profession.
    """
    has_male = results_df["Male Pronoun"] != ""
    has_female = results_df["Female Pronoun"] != ""
    has_profession = results_df["Profession"] != ""
    is_both = results_df["Both Match"] == "Yes"

    def _count(mask, column):
        # Number of non-null values of `column` among the selected rows.
        return results_df[mask][column].count()

    return {
        "total_sentence": results_df.shape[0],
        "both_gender_prof_match": _count(is_both, "Both Match"),
        "count_male_pronoun": _count(has_male, "Male Pronoun"),
        "count_female_pronoun": _count(has_female, "Female Pronoun"),
        "count_male_pronoun_profession": _count(has_male & has_profession, "Male Pronoun"),
        "count_female_pronoun_profession": _count(has_female & has_profession, "Female Pronoun"),
    }
98
+
99
+ # Function to return regular expression patterns
100
def get_regex_pattern(male_pronoun, female_pronoun, professions):
    """Build whole-word regex patterns for the pronoun and profession lists.

    Every term is treated as literal text (regex metacharacters are escaped).
    Longer terms are placed first in the alternation so that a multi-word
    phrase such as "physician assistant" is not shadowed by a shorter term
    ("physician") that matches at the same position.

    Args:
        male_pronoun: Iterable of male pronoun strings.
        female_pronoun: Iterable of female pronoun strings.
        professions: Iterable of profession names; they are lower-cased
            before the pattern is built.

    Returns:
        A tuple ``(male_pronoun_pat, female_pronoun_pat, professions_pat)`` of
        pattern strings. A pattern built from an empty list never matches.
    """

    def _alternation(terms):
        # Deduplicate, drop empty strings, and order longest-first (ties are
        # broken alphabetically so the result is deterministic).
        unique_terms = sorted(
            {term for term in terms if term},
            key=lambda term: (-len(term), term),
        )
        if not unique_terms:
            # "\b()\b" would match at every word boundary; use a pattern that
            # can never match instead.
            return r'(?!)'
        return r'\b({})\b'.format("|".join(re.escape(term) for term in unique_terms))

    male_pronoun_pat = _alternation(male_pronoun)
    female_pronoun_pat = _alternation(female_pronoun)

    # Lower-case the professions before building the pattern.
    professions_pat = _alternation(prof.lower() for prof in professions)

    return male_pronoun_pat, female_pronoun_pat, professions_pat
111
+
112
+ # Function to load sample of dataset
113
def load_sample(sample_first_records, sample_random_seed, sample_method, df, col_name):
    """Reduce ``df`` to ``col_name`` and optionally down-sample it.

    Args:
        sample_first_records: Number of rows to keep when sampling applies.
        sample_random_seed: ``random_state`` used for the random pick.
        sample_method: ``"first_record"`` keeps the leading rows,
            ``"random_pick"`` draws rows at random; any other value
            disables sampling.
        df: Source DataFrame.
        col_name: Name of the only column to keep.

    Returns:
        The single-column frame unchanged when no sampling applies (unknown
        method, or the frame has no more than ``sample_first_records`` rows);
        otherwise the sampled rows with the index reset, which adds the
        previous index as an ``index`` column.
    """
    column_only = df[[col_name]]

    if sample_method not in ("first_record", "random_pick"):
        return column_only
    if not column_only.shape[0] > sample_first_records:
        return column_only

    if sample_method == "first_record":
        picked = column_only.iloc[:sample_first_records]
    else:
        picked = column_only.sample(sample_first_records, random_state=sample_random_seed)
    return picked.copy().reset_index()
122
+
123
def load_dataset_and_analyze_gender_profession(df, sample_method, col_name):
    """Sample ``df``, tag its sentences and return the summary statistics.

    The sample size and random seed come from the config (``first_records``
    and ``random_seed``).

    Args:
        df: Source DataFrame.
        sample_method: Sampling strategy forwarded to ``load_sample``.
        col_name: Name of the text column to analyse.

    Returns:
        The dict produced by ``get_statistics`` for the tagged sentences.
    """
    sample_df = load_sample(
        get_args("first_records"),
        get_args("random_seed"),
        sample_method,
        df,
        col_name,
    )

    # Normalise the text: lower-case it and strip surrounding whitespace.
    normalized_text = sample_df[col_name].str.lower().str.strip()
    sample_df[col_name] = normalized_text

    # Tag every sentence through the thread pool, then summarise.
    tagged_df = call_multiprocessing_pool(sample_df[col_name])
    return get_statistics(tagged_df)
scripts/gender_tagging.py ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Import required libraries
2
+ import pandas as pd
3
+ import re
4
+ from utils.read_config import get_args
5
+
6
+ # Function to get count of male terms in text
7
def count_male_terms(text, male_terms):
    """Count whole-word occurrences of any male term in ``text``.

    Terms are matched literally (regex metacharacters are escaped) and
    case-sensitively.

    Args:
        text: Value to search; it is converted with ``str()`` first.
        male_terms: Iterable of term strings.

    Returns:
        The number of matches; 0 when ``male_terms`` has no non-empty term.
    """
    alternatives = [re.escape(term) for term in male_terms if term]
    if not alternatives:
        # An empty alternation would match at every word boundary.
        return 0
    pattern = r"\b({})\b".format("|".join(alternatives))
    return len(re.findall(pattern, str(text)))
12
+
13
+ # Function to get count of female terms in text
14
def count_female_terms(text, female_terms):
    """Count whole-word occurrences of any female term in ``text``.

    Terms are matched literally (regex metacharacters are escaped) and
    case-sensitively.

    Args:
        text: Value to search; it is converted with ``str()`` first.
        female_terms: Iterable of term strings.

    Returns:
        The number of matches; 0 when ``female_terms`` has no non-empty term.
    """
    alternatives = [re.escape(term) for term in female_terms if term]
    if not alternatives:
        # An empty alternation would match at every word boundary.
        return 0
    pattern = r"\b({})\b".format("|".join(alternatives))
    return len(re.findall(pattern, str(text)))
19
+
20
+ # Function to get gender tag categories
21
def get_gender_tag(count_m_term, count_f_term):
    """Classify a text by the balance of its male and female term counts.

    Args:
        count_m_term: Number of male terms found.
        count_f_term: Number of female terms found.

    Returns:
        "No Gender" when both counts are zero, "Equal Gender" when they are
        equal, otherwise "<Male|Female> Positive Gender" when the larger
        count is at least 50% and below 75% of the total, or
        "<Male|Female> Strongly Positive Gender" when it is 75% or more.
        An empty string is returned when no rule applies.
    """
    if count_m_term == 0 and count_f_term == 0:
        return "No Gender"
    if count_m_term == count_f_term:
        return "Equal Gender"

    if count_m_term > count_f_term:
        dominant = count_m_term
        labels = ("Male Positive Gender", "Male Strongly Positive Gender")
    elif count_m_term < count_f_term:
        dominant = count_f_term
        labels = ("Female Positive Gender", "Female Strongly Positive Gender")
    else:
        return ''

    # Share of the dominant gender among all gendered terms, in percent.
    share = (dominant / (count_m_term + count_f_term)) * 100
    if 50 <= share < 75:
        return labels[0]
    if share >= 75:
        return labels[1]
    return ''
44
+
45
+
46
+ # Function to load sample of dataset
47
def load_sample(sample_first_records, sample_random_seed, sample_method, df, col_name):
    """Keep only ``col_name`` of ``df`` and down-sample it when requested.

    Args:
        sample_first_records: Number of rows to keep when sampling applies.
        sample_random_seed: ``random_state`` used for the random pick.
        sample_method: ``"first_record"`` takes the leading rows,
            ``"random_pick"`` draws rows at random; other values leave the
            frame unsampled.
        df: Source DataFrame.
        col_name: Name of the only column to keep.

    Returns:
        The single-column frame. When sampling applies (known method and
        more than ``sample_first_records`` rows) the sampled rows are
        returned with the index reset, which adds the previous index as an
        ``index`` column.
    """
    frame = df[[col_name]]
    take_head = sample_method == "first_record"
    take_random = sample_method == "random_pick"

    if (take_head or take_random) and frame.shape[0] > sample_first_records:
        if take_head:
            frame = frame.iloc[:sample_first_records].copy().reset_index()
        else:
            frame = frame.sample(
                sample_first_records, random_state=sample_random_seed
            ).copy().reset_index()
    return frame
56
+
57
+ # Function to calculate PG and SPG
58
def get_pg_spg(sample_df):
    """Count the rows of ``sample_df`` in each gender category.

    "pg" stands for the "Positive Gender" categories and "spg" for the
    "Strongly Positive Gender" categories.

    Args:
        sample_df: DataFrame with a ``gender_cat`` column holding the tags
            produced by ``get_gender_tag``.

    Returns:
        A dict with the keys ``gender`` (rows tagged with anything other than
        "No Gender"), ``no gender``, ``equal gender``, ``female pg``,
        ``male pg``, ``female spg`` and ``male spg``.
    """
    categories = sample_df["gender_cat"]

    def _count(label):
        # Number of rows whose category equals `label`.
        return categories[categories == label].count()

    return {
        "gender": categories[categories != "No Gender"].count(),
        "no gender": _count("No Gender"),
        "equal gender": _count("Equal Gender"),
        "female pg": _count("Female Positive Gender"),
        "male pg": _count("Male Positive Gender"),
        # The label must match get_gender_tag's spelling exactly, otherwise
        # this count is always zero.
        "female spg": _count("Female Strongly Positive Gender"),
        "male spg": _count("Male Strongly Positive Gender"),
    }
79
+
80
+ # Function to load dataset and get the analysis done
81
def load_dataset_and_analyze_gender_tag(df, sample_method, col_name):
    """Sample ``df``, tag each text by gender balance and summarise the tags.

    The term lists, sample size and random seed are read from the config
    (``male_terms``, ``female_terms``, ``first_records``, ``random_seed``).

    Args:
        df: Source DataFrame.
        sample_method: Sampling strategy forwarded to ``load_sample``.
        col_name: Name of the text column to analyse.

    Returns:
        The dict of category counts produced by ``get_pg_spg``.
    """
    male_terms = get_args("male_terms")
    female_terms = get_args("female_terms")
    sample_first_records = get_args("first_records")
    sample_random_seed = get_args("random_seed")

    sample_df = load_sample(sample_first_records, sample_random_seed, sample_method, df, col_name)

    # Normalise the text: lower-case it and strip surrounding whitespace.
    sample_df[col_name] = sample_df[col_name].str.lower().str.strip()

    # Count the terms in the text column only. Passing the whole row would
    # count terms in the row's string representation instead of the text.
    texts = sample_df[col_name]
    sample_df['count_male_term'] = texts.apply(lambda text: count_male_terms(text, male_terms))
    sample_df['count_female_term'] = texts.apply(lambda text: count_female_terms(text, female_terms))

    # Build the tags from the two count columns; a plain list also works
    # when the sample is empty.
    sample_df['gender_cat'] = [
        get_gender_tag(male_count, female_count)
        for male_count, female_count in zip(
            sample_df['count_male_term'], sample_df['count_female_term']
        )
    ]

    return get_pg_spg(sample_df)
107
+
108
+
109
+
setup.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ from setuptools import setup, find_packages
2
+
3
# Register the distribution "BIASAWARE" (version 0.1) with every package that
# find_packages() discovers.
setup(name='BIASAWARE', version='0.1', packages=find_packages())
utils/config.json ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "first_records" : 200,
3
+ "random_seed" : 42,
4
+ "male_terms" : ["man", "boy", "male", "he", "son", "his", "himself", "guy", "father", "john"],
5
+ "female_terms" : ["woman", "girl", "female", "she", "daughter", "her", "herself", "gal", "mother", "mary"],
6
+ "male_pronoun" : ["he", "him", "his"],
7
+ "female_pronoun" : ["she", "her", "hers"],
8
+ "professions" : ["Accountant",
9
+ "Actor",
10
+ "Actress",
11
+ "Aerospace Engineer",
12
+ "Agricultural Scientist",
13
+ "Air Traffic Controller",
14
+ "Aircraft Mechanic",
15
+ "Animator",
16
+ "Architect",
17
+ "Art Director",
18
+ "Attorney",
19
+ "Lawyer",
20
+ "Audiologist",
21
+ "Author",
22
+ "Writer",
23
+ "Baker",
24
+ "Barber",
25
+ "Hairdresser",
26
+ "Bartender",
27
+ "Biomedical Engineer",
28
+ "Botanist",
29
+ "Broadcast Journalist",
30
+ "Business Analyst",
31
+ "Carpenter",
32
+ "Chef",
33
+ "Cook",
34
+ "Chemist",
35
+ "Civil Engineer",
36
+ "Clinical Psychologist",
37
+ "Commercial Diver",
38
+ "Computer Programmer",
39
+ "Construction Worker",
40
+ "Corporate Trainer",
41
+ "Cosmetologist",
42
+ "Counselor",
43
+ "Therapist",
44
+ "Court Reporter",
45
+ "Creative Director",
46
+ "Criminologist",
47
+ "Customer Service Representative",
48
+ "Data Analyst",
49
+ "Dental Assistant",
50
+ "Dentist",
51
+ "Dermatologist",
52
+ "Dietician",
53
+ "Nutritionist",
54
+ "Doctor",
55
+ "Physician",
56
+ "Economist",
57
+ "Electrician",
58
+ "Elementary School Teacher",
59
+ "Emergency Medical Technician",
60
+ "Engineer",
61
+ "Environmental Scientist",
62
+ "Event Planner",
63
+ "Fashion Designer",
64
+ "Film Director",
65
+ "Financial Analyst",
66
+ "Firefighter",
67
+ "Fisherman",
68
+ "Fitness Trainer",
69
+ "Flight Attendant",
70
+ "Florist",
71
+ "Food Scientist",
72
+ "Forensic Scientist",
73
+ "Furniture Maker",
74
+ "Game Developer",
75
+ "Gardener",
76
+ "Landscaper",
77
+ "Geologist",
78
+ "Graphic Designer",
79
+ "Hair Stylist",
80
+ "Historian",
81
+ "Home Health Aide",
82
+ "Hotel Manager",
83
+ "Human Resources Manager",
84
+ "Immigration Lawyer",
85
+ "Industrial Designer",
86
+ "Insurance Agent",
87
+ "Interior Designer",
88
+ "Interpreter",
89
+ "Translator",
90
+ "Investment Banker",
91
+ "IT Specialist",
92
+ "Journalist",
93
+ "Judge",
94
+ "Kindergarten Teacher",
95
+ "Land Surveyor",
96
+ "Landscape Architect",
97
+ "Lawyer",
98
+ "Attorney",
99
+ "Librarian",
100
+ "Life Coach",
101
+ "Linguist",
102
+ "Makeup Artist",
103
+ "Management Consultant",
104
+ "Manufacturing Engineer",
105
+ "Marine Biologist",
106
+ "Marketing Manager",
107
+ "Massage Therapist",
108
+ "Mechanical Engineer",
109
+ "Medical Assistant",
110
+ "Medical Researcher",
111
+ "Meteorologist",
112
+ "Midwife",
113
+ "Military Officer",
114
+ "Music Producer",
115
+ "Musician",
116
+ "Nurse",
117
+ "Occupational Therapist",
118
+ "Optician",
119
+ "Optometrist",
120
+ "Paralegal",
121
+ "Paramedic",
122
+ "Patent Attorney",
123
+ "Pediatrician",
124
+ "Personal Trainer",
125
+ "Petroleum Engineer",
126
+ "Pharmacist",
127
+ "Photographer",
128
+ "Physical Therapist",
129
+ "Physician Assistant",
130
+ "Pilot",
131
+ "Plumber",
132
+ "Police Officer",
133
+ "Political Scientist",
134
+ "Preschool Teacher",
135
+ "Private Investigator",
136
+ "Product Manager",
137
+ "Professor",
138
+ "Lecturer",
139
+ "Programmer",
140
+ "Psychiatrist",
141
+ "Psychologist",
142
+ "Public Relations Specialist",
143
+ "Public School Teacher",
144
+ "Real Estate Agent",
145
+ "Broker",
146
+ "Receptionist",
147
+ "Registered Nurse",
148
+ "Reporter",
149
+ "Restaurant Manager",
150
+ "Sales Representative",
151
+ "School Counselor",
152
+ "Scientist",
153
+ "Screenwriter",
154
+ "Social Media Manager",
155
+ "Social Worker",
156
+ "Software Developer",
157
+ "Speech-Language Pathologist",
158
+ "Sports Coach",
159
+ "Statistician"]
160
+ }
utils/read_config.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+
3
def read_config_file():
    """Load and return the parsed contents of ``utils/config.json``.

    The path is relative to the current working directory. The file is read
    as UTF-8 rather than with the platform's default encoding.

    Returns:
        The object decoded from the JSON file.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If the file is not valid UTF-8 encoded JSON.
    """
    with open("utils/config.json", "r", encoding="utf-8") as jsonfile:
        return json.load(jsonfile)
7
+
8
def get_args(args):
    """Return the config value stored under the key ``args``.

    The config file is re-read through ``read_config_file`` on every call.

    Args:
        args: Key to look up in the config.

    Returns:
        The value stored under ``args``.

    Raises:
        RuntimeError: If the config file cannot be opened or parsed.
        KeyError: If ``args`` is not present in the config.
    """
    try:
        data = read_config_file()
    except (OSError, ValueError) as err:
        # Raising a plain string is itself a TypeError in Python 3; raise a
        # real exception and keep the underlying cause attached.
        raise RuntimeError("Could not read config file.") from err
    return data[args]