# Page-header residue from the hosting site (not part of the script): "Spaces: Sleeping Sleeping"
import os
from math import ceil
from random import sample

import pandas as pd
# Directory that receives the generated CSV files (one per occupation and
# dilution level, see the loop at the bottom of the script).
output_path = 'information_diluted'
# Input table; the script reads its 'Occupation', 'Role', 'Resume' and
# 'Cleaned_Resume' columns.
df = pd.read_csv('resume_subsampled.csv')
def dilute_paragraph(paragraph: str, percentage: float, *, preserve_order: bool = False):
    """Keep a random subset of the '.'-separated fragments of *paragraph*.

    The paragraph is split on '.', ``ceil(len(fragments) * percentage)``
    fragments are drawn without replacement, and the draw is joined back
    together with '.'.

    Args:
        paragraph: Text to dilute.
        percentage: Fraction of fragments to keep, in the range [0, 1].
        preserve_order: When false (the default), the kept fragments appear
            in the random order produced by ``random.sample``, so even
            ``percentage=1.0`` generally returns a shuffled paragraph.
            When true, the kept fragments retain their original relative
            order.

    Returns:
        A ``(diluted_text, kept_count)`` tuple.

    Raises:
        ValueError: If *percentage* is outside [0, 1].
    """
    if not 0 <= percentage <= 1:
        raise ValueError(f'percentage must be between 0 and 1, got {percentage!r}')
    # NOTE: a paragraph ending in '.' yields a trailing empty fragment, which
    # is counted and can be sampled like any other fragment.
    sentences = paragraph.split('.')
    size = ceil(len(sentences) * percentage)
    if preserve_order:
        # Sample positions rather than fragments so the original order can be
        # restored even when the same fragment text occurs more than once.
        kept_positions = sorted(sample(range(len(sentences)), size))
        sentences_kept = [sentences[pos] for pos in kept_positions]
    else:
        sentences_kept = sample(sentences, size)
    return '.'.join(sentences_kept), size
# Write one CSV per (occupation, dilution level) pair into ``output_path``.
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(output_path, exist_ok=True)
for i in df['Occupation'].unique():
    # The rows of an occupation do not depend on the dilution level, so the
    # filter is applied once per occupation rather than once per level.
    occupation_rows = df[df['Occupation'] == i]
    for perc in [0.1, 0.2, 0.4, 0.6, 0.8, 1.0]:
        records = []
        for _, row in occupation_rows.iterrows():
            diluted, length = dilute_paragraph(row['Cleaned_Resume'], perc)
            records.append({
                'Occupation': i,
                'Role': row['Role'],
                'Resume': row['Resume'],
                'Diluted_Resume': diluted,
                # Fragment count of the undiluted text, using the same
                # '.'-split as dilute_paragraph so it compares to Diluted_len.
                'Resume_len': len(row['Cleaned_Resume'].split('.')),
                'Diluted_len': length,
                'Dilution': perc,
            })
        # Build the frame once; growing it with pd.concat inside the row loop
        # re-copies all previous rows on every iteration.
        answer = pd.DataFrame(
            records,
            columns=['Occupation', 'Role', 'Resume', 'Diluted_Resume',
                     'Resume_len', 'Diluted_len', 'Dilution'],
        )
        answer.to_csv(f'{output_path}/{i}_{perc}.csv', index=False)