job-fair / information-dillution.py
ProgU
new
a12d0aa
raw
history blame
1.07 kB
from math import ceil
from pathlib import Path
from random import sample

import pandas as pd
# Source table of resumes; the script below reads its 'Occupation', 'Role',
# 'Resume' and 'Cleaned_Resume' columns.
df = pd.read_csv(filepath_or_buffer='resume_subsampled.csv')

# Directory that receives one CSV per (occupation, dilution level) pair.
output_path = 'information_diluted'
def dilute_paragraph(paragraph: str, percentage: float):
    """Keep a random fraction of a paragraph's sentences, in original order.

    The paragraph is split on '.', and ``ceil(len(pieces) * percentage)`` of
    the resulting pieces are chosen uniformly at random without replacement.
    The chosen pieces are re-joined with '.' in the order they appeared in
    the input, so ``percentage=1.0`` returns the paragraph unchanged.

    Args:
        paragraph: Text to dilute. Pieces are whatever ``str.split('.')``
            yields, so a trailing '.' contributes an empty final piece.
        percentage: Fraction of pieces to keep, expected to lie in [0, 1].

    Returns:
        A ``(diluted_text, size)`` tuple, where ``size`` is the number of
        pieces that were kept.

    Raises:
        ValueError: If the computed sample size is negative or larger than
            the number of pieces (propagated from ``random.sample``).
    """
    sentences = paragraph.split('.')
    size = int(ceil(len(sentences) * percentage))
    # Sample positions instead of the sentences themselves: random.sample
    # returns items in selection order, which would scramble the text.
    # Sorting the positions restores document order.
    kept_positions = sorted(sample(range(len(sentences)), size))
    return '.'.join(sentences[pos] for pos in kept_positions), size
# Make sure the destination directory exists; DataFrame.to_csv does not
# create missing parent directories.
Path(output_path).mkdir(parents=True, exist_ok=True)

# Write one CSV per (occupation, dilution level) pair.
for occupation in df['Occupation'].unique():
    # The rows for an occupation are the same at every dilution level,
    # so filter once instead of once per level.
    subset = df[df['Occupation'] == occupation]
    for perc in [0.1, 0.2, 0.4, 0.6, 0.8, 1.0]:
        # Collect plain records and build the frame once; growing a DataFrame
        # with pd.concat per row re-copies all earlier rows every iteration.
        records = []
        for role, resume, cleaned in zip(
            subset['Role'], subset['Resume'], subset['Cleaned_Resume']
        ):
            diluted, length = dilute_paragraph(cleaned, perc)
            records.append({
                'Occupation': occupation,
                'Role': role,
                'Resume': resume,
                'Diluted_Resume': diluted,
                # Piece count of the undiluted text, using the same '.' split
                # that dilute_paragraph applies.
                'Resume_len': len(cleaned.split('.')),
                'Diluted_len': length,
                'Dilution': perc,
            })
        answer = pd.DataFrame(records)
        answer.to_csv(f'{output_path}/{occupation}_{perc}.csv', index=False)