ProgU committed on
Commit
a12d0aa
1 Parent(s): 5e310c5
information-dillution.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from random import sample
3
+ from math import ceil
4
+
5
# Directory that receives one CSV per (occupation, dilution level) pair.
output_path = 'information_diluted'

# Source table; the script reads its 'Occupation', 'Role', 'Resume' and
# 'Cleaned_Resume' columns.
df = pd.read_csv('resume_subsampled.csv')
7
+
8
+
9
def dilute_paragraph(paragraph: str, percentage: float):
    """Keep a random fraction of a paragraph's sentences, in original order.

    The paragraph is split on ``'.'``. Every resulting fragment counts as a
    sentence, including empty ones (for example the fragment after a
    trailing period), so the total equals ``len(paragraph.split('.'))``.

    Args:
        paragraph: Text whose sentences are separated by ``'.'``.
        percentage: Fraction of sentences to keep, expected in ``[0, 1]``.
            The number kept is ``ceil(total * percentage)``.

    Returns:
        A ``(diluted_text, kept_count)`` tuple: the kept sentences re-joined
        with ``'.'`` and how many were kept. With ``percentage == 1.0`` the
        diluted text is identical to ``paragraph``.

    Raises:
        ValueError: Raised by ``random.sample`` when ``percentage`` asks for
            more sentences than exist or for a negative number of them.
    """
    sentences = paragraph.split('.')
    total = len(sentences)

    # Round away floating-point noise before taking the ceiling; otherwise
    # e.g. 10 * 0.7 == 7.000000000000001 would keep 8 sentences, not 7.
    size = int(ceil(round(total * percentage, 9)))

    # Sample positions instead of sentences and sort them, so the surviving
    # sentences keep their original relative order (random.sample alone
    # returns them in selection order, i.e. shuffled).
    kept_positions = sorted(sample(range(total), size))
    return '.'.join(sentences[pos] for pos in kept_positions), size
14
+
15
+
16
# Local import: only this script section needs it, to create the output
# directory before writing.
import os

# Fractions of sentences kept per resume; one CSV is written per
# (occupation, fraction) pair.
dilution_levels = [0.1, 0.2, 0.4, 0.6, 0.8, 1.0]

# DataFrame.to_csv does not create missing parent directories.
os.makedirs(output_path, exist_ok=True)

for occupation in df['Occupation'].unique():
    # Select this occupation's rows once instead of once per dilution level.
    occupation_rows = df[df['Occupation'] == occupation]

    for perc in dilution_levels:
        # Collect plain dicts and build the frame once; concatenating a
        # one-row frame per resume copies the accumulated frame every time.
        records = []
        for _, row in occupation_rows.iterrows():
            cleaned = row['Cleaned_Resume']
            diluted, length = dilute_paragraph(cleaned, perc)
            records.append({
                'Occupation': occupation,
                'Role': row['Role'],
                'Resume': row['Resume'],
                'Diluted_Resume': diluted,
                # Same sentence definition as dilute_paragraph: split on '.'.
                'Resume_len': len(cleaned.split('.')),
                'Diluted_len': length,
                'Dilution': perc,
            })

        answer = pd.DataFrame(records)
        answer.to_csv(f'{output_path}/{occupation}_{perc}.csv', index=False)
26
+
27
+
information_diluted/CONSTRUCTION_0.1.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:56a49599b29457510af23d31def1f941ca684997c721639e1bc5942cca8c117e
3
+ size 697818
information_diluted/CONSTRUCTION_0.2.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a244a351fdc16a3abd63fc669ab015dccaa977fa663f47b32c628dabfea76786
3
+ size 765594
information_diluted/CONSTRUCTION_0.4.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1561307d529b43f475dd7aad1fa4a256927d75f14d4775a8c5c8149f11545f12
3
+ size 884289
information_diluted/CONSTRUCTION_0.6.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3f93ae691f8d4b97e0ea11ea0c8bb35b50983a594e595fdc53a2bb845174c513
3
+ size 1001689
information_diluted/CONSTRUCTION_0.8.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2054eab7ef54b5558b799cd6ea855d33995d8f3b31ec884da4fa63555226aabd
3
+ size 1132853
information_diluted/CONSTRUCTION_1.0.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:21f99cfee60552804ac833a98cc226842a3f781cbb72846a4870e0c5263daa15
3
+ size 1254210
information_diluted/FINANCE_0.1.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:165f3fb51af8f06e767d6f9736fbc7d4a2ef9b5f95fc64ceccee3e4f8971eb14
3
+ size 694242
information_diluted/FINANCE_0.2.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a4938d85c91449ec872af6e81c267691c55a165f202307256db9f59382e0d343
3
+ size 761615
information_diluted/FINANCE_0.4.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9fea9794553ce7e503c9047218dcc29f4ce39d287ac06dfc8691ad9ae607c855
3
+ size 883572
information_diluted/FINANCE_0.6.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c0b725da492391c5ad490da7ef64efcecc32a3e1d4e34bcb5924eedb6cfcc4e8
3
+ size 996213
information_diluted/FINANCE_0.8.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6c99881d366677e7f1645b1bbb2dab35d8ee30720017f33743a5a7724fe5f3c2
3
+ size 1129399
information_diluted/FINANCE_1.0.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b83e633f6a598d6e957dd66dc4541aeea9f3d6706abe70b988d7a8c24847728b
3
+ size 1242126
information_diluted/HEALTHCARE_0.1.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4de3ec53856cac5f9e27433b78202cab3ef278c7220d55649d2a5410af73b24c
3
+ size 757404
information_diluted/HEALTHCARE_0.2.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:385717583856adbd5b3d0ad03085871cc2f567c4daffbea9c98b1ea4a261cba6
3
+ size 824734
information_diluted/HEALTHCARE_0.4.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8cc9ceb5834995a97f1192ffd00e68fb9e5ef2a532e6f1713aa9a1b55abcdeee
3
+ size 965444
information_diluted/HEALTHCARE_0.6.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bb11c614547e1c1c56c07c1fc0c2b242211b4b5d49b8073fca470a7583603a52
3
+ size 1083102
information_diluted/HEALTHCARE_0.8.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c4b7098e27e53e06ef0e23b495f03ae8e1a027a57eb8c16c8c2b89999fd4155d
3
+ size 1223547
information_diluted/HEALTHCARE_1.0.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:11054cd592d47ec7abfc8253858a5d7cc969af3f1c93fdca5340933c1817c8f2
3
+ size 1350566