rosyvs committed on
Commit
e5eb577
1 Parent(s): 6d504a5

remove unnecessary files, add preprocessor_config.json

Browse files
__init__.py DELETED
File without changes
__pycache__/__init__.cpython-310.pyc DELETED
Binary file (128 Bytes)
 
__pycache__/benchmark_utils.cpython-310.pyc DELETED
Binary file (10.2 kB)
 
__pycache__/converters.cpython-310.pyc DELETED
Binary file (5.32 kB)
 
__pycache__/renamers.cpython-310.pyc DELETED
Binary file (1.61 kB)
 
__pycache__/trimmers.cpython-310.pyc DELETED
Binary file (2.77 kB)
 
converters.py DELETED
@@ -1,206 +0,0 @@
1
- import os
2
- import csv
3
- import re
4
- import pandas as pd
5
- from pathlib import Path
6
- import numpy as np
7
- # functions to convert between different transcript/annotation formats
8
-
9
- #######
10
- # "table" refers to a pd.Dataframe w the following cols
11
- # [uttID, speaker, transcript, start_sec, end_sec]
12
- #########
13
-
14
- # separate function to write to csv, tsv or ELAN compatible (ELAN interprets ALL commas as delimiter so we need to use tab instead)
15
-
16
def HHMMSS_to_sec(time_str):
    """Convert a timestamp string to seconds (float).

    Supported formats (chosen by counting ':' separators):
      HH:MM:SS(.ms)    -> two colons
      HH:MM:SS:f[f[f]] -> three colons; the trailing field is tenths,
                          hundredths or thousandths depending on digit count
      MM:SS            -> one colon (hours assumed 0)
      SS.ms            -> no colon, one dot (hours and minutes assumed 0)

    Returns:
        float seconds, or None (after printing a warning) for empty or
        unsupported input.
    """
    if not time_str:
        return None
    n_colons = time_str.count(':')
    if n_colons == 2:
        h, m, s = time_str.split(':')
    elif n_colons == 3:
        # weird timestamps where a sub-second field follows seconds, delimited by a colon
        h, m, s, u = time_str.split(':')
        # determine whether the final field is tenths/hundredths/thousandths by counting digits
        if len(u) == 1:  # tenths
            print('Weird time format detected - HH:MM:SS:tenths - please verify this is how you want the time interpreted')
            ms = float(u) / 10
        elif len(u) == 2:  # hundredths
            ms = float(u) / 100
        elif len(u) == 3:  # thousandths (comment previously said "hundredths")
            ms = float(u) / 1000
        else:
            print(f'input string format not supported: {time_str}')
            return None
        s = int(s) + ms
    elif n_colons == 1:
        # missing HH from timestamp, assuming MM:SS
        m, s = time_str.split(':')
        h = 0
    elif n_colons == 0 and time_str.count('.') == 1:
        # missing HH:MM from timestamp, assuming SS.ms
        s = float(time_str)
        h = 0
        m = 0
    else:
        print(f'input string format not supported: {time_str}')
        return None
    return int(h) * 3600 + int(m) * 60 + float(s)
50
-
51
def sec_to_timecode(time_sec):
    """Convert seconds to an H:MM:SS:hh timecode (hundredths) as used in .xlsx transcripts."""
    # BUG FIX: rounding the hundredths field in isolation could yield 100
    # (e.g. 1.999 s became '0:00:01:100'). Work in integer hundredths and
    # carry upward instead, so 1.999 s is '0:00:02:00'.
    total_hundredths = round(100 * time_sec)
    s, u = divmod(total_hundredths, 100)
    m, s = divmod(s, 60)
    h, m = divmod(m, 60)
    timecode = f'{h}:{m:02}:{s:02}:{u:02}'
    return timecode
59
-
60
def docx_scraped_tsv_to_table(ooona_file):
    """Parse a tsv manually copied out of an ooona .docx transcript table.

    The ooona output is a table inside a word docx; for now it is copied out
    by hand and saved as tsv (input columns: SHOT START END SPEAKER DIALOGUE,
    header row skipped).

    Returns:
        pd.DataFrame with columns [uttID, speaker, transcript, start_sec, end_sec].
    """
    rows = []
    with open(ooona_file) as in_file:
        reader = csv.reader(in_file, delimiter="\t")
        next(reader)  # skip header
        for utt_ix, start_time, end_time, speaker, transcript in reader:
            rows.append([utt_ix,
                         speaker,
                         transcript,
                         HHMMSS_to_sec(start_time),
                         HHMMSS_to_sec(end_time)])
    return pd.DataFrame(rows, columns=['uttID', 'speaker', 'transcript', 'start_sec', 'end_sec'])
77
- # table = pd.read_csv(ooona_file, sep='\t')
78
-
79
def molly_xlsx_to_table(xl_file):
    """Read a contractor-transcriber xlsx into a table.

    Expected input columns (case-insensitive):
      #: int, Timecode: "HH:MM:SS:ss - HH:MM:SS:ss", Duration: HH:MM:SS:ss,
      Speaker: str, Dialogue: str, Annotations: blank, Error Type: blank.

    Returns:
        pd.DataFrame with columns [uttID, speaker, transcript, start_sec, end_sec];
        rows empty in BOTH speaker and transcript are dropped.
    """
    with pd.ExcelFile(xl_file) as xls:
        sheetname = xls.sheet_names
        table = pd.DataFrame(pd.read_excel(xls, sheetname[0]))
    table.columns = table.columns.str.lower()
    table[['start_time', 'end_time']] = table['timecode'].str.split('-', expand=True)
    table['start_sec'] = table['start_time'].str.strip().apply(HHMMSS_to_sec)
    table['end_sec'] = table['end_time'].str.strip().apply(HHMMSS_to_sec)
    table.drop(labels=['annotations', 'error type', 'duration'], axis=1, inplace=True)
    table = table[['#', 'speaker', 'dialogue', 'start_sec', 'end_sec']]
    table.rename(columns={'#': 'uttID', 'dialogue': 'transcript'}, inplace=True)
    table.reset_index(inplace=True, drop=True)
    # BUG FIX: after the rename above there is no 'dialogue' column any more,
    # so dropna(subset=['speaker','dialogue']) raised KeyError on every call;
    # use the renamed column 'transcript'.
    table = table.replace('', np.nan).dropna(subset=['speaker', 'transcript'], how='all')
    return table
101
-
102
def LoFi_xlsx_to_table(xl_file):
    """Read a LoFi transcript xlsx into a table.

    Expected input columns:
      #: int, Timecode: "HH:MM:SS:ss - HH:MM:SS:ss", Duration: HH:MM:SS:ss,
      Speaker: str, Dialogue: str, Annotations: blank, Error Type: blank.

    Returns:
        pd.DataFrame with columns [uttID, speaker, transcript, start_sec, end_sec].
    """
    with pd.ExcelFile(xl_file) as xls:
        first_sheet = xls.sheet_names[0]
        table = pd.DataFrame(pd.read_excel(xls, first_sheet))
    # split the "start - end" timecode and convert each side to seconds
    table[['start_time', 'end_time']] = table['Timecode'].str.split('-', expand=True)
    table['start_sec'] = table['start_time'].str.strip().apply(HHMMSS_to_sec)
    table['end_sec'] = table['end_time'].str.strip().apply(HHMMSS_to_sec)
    table = table.drop(labels=['Annotations', 'Error Type', 'Duration'], axis=1)
    table = table[['#', 'Speaker', 'Dialogue', 'start_sec', 'end_sec']]
    return table.rename(columns={'#': 'uttID', 'Speaker': 'speaker', 'Dialogue': 'transcript'})
122
-
123
def saga_to_table(saga_txt):
    """Parse saga's own txt transcript format into a table.

    The file repeats 3-line groups:
        speaker (start time MM:SS)
        utterance
        <blank line>
    If the speaker line contains no '(', the line holds only a timestamp and
    the previous row's speaker is reused.

    Returns:
        pd.DataFrame with columns [uttID, speaker, transcript, start_sec, end_sec]
        (end_sec is always None).

    TODO: make more robust by pattern matching instead of modulo
    """
    with open(saga_txt) as in_file:
        reader = csv.reader(in_file, delimiter="\n")
        count = 0
        rows = []
        for i, line in enumerate(reader):
            print((count, line))
            if count % 3 == 0:
                spk_time = line[0].split('(')
                if len(spk_time) < 2:
                    # speaker not changed: the line is only a timestamp
                    timestamp = spk_time[0].strip('):( ')
                    # BUG FIX: a row is [uttID, speaker, transcript, start_sec, end_sec],
                    # so the previous speaker is rows[-1][1]; rows[-1][0] was the
                    # previous uttID.
                    speaker = rows[-1][1]
                else:
                    speaker = spk_time[0]
                    timestamp = spk_time[1].replace('):', '')
                start_sec = HHMMSS_to_sec(timestamp)
            if count % 3 == 1:
                transcript = line[0]
            if count % 3 == 2:
                rows.append([i, speaker, transcript, start_sec, None])
            count += 1
    utt_table = pd.DataFrame(rows, columns=['uttID', 'speaker', 'transcript', 'start_sec', 'end_sec'])
    return utt_table
160
-
161
def table_to_ELAN_tsv(table: pd.DataFrame, path: str):
    """Write the table as a tab-separated file importable by ELAN.

    ELAN interprets ALL commas as delimiters, so tab is used instead;
    floats are written with millisecond precision.
    """
    table.to_csv(path, sep='\t', index=False, float_format='%.3f')
164
-
165
def table_to_standard_csv(table: pd.DataFrame, path: str):
    """Write the table to the standard csv format agreed upon by the whole team.

    Floats (times in seconds) are written with millisecond precision.
    """
    # TODO: convert times in seconds back to HH:MM:SS?
    # TODO: split utterances into sentences?
    table.to_csv(path, float_format='%.3f', index=False)
171
-
172
def table_to_utt_labels_csv(table: pd.DataFrame, path: str):
    """Write the table to the utt_labels csv format compatible with rosy's isatasr lib.

    Columns 'transcript' and 'uttID' are written out as 'utterance' and 'seg'.
    Rows with empty values in BOTH speaker and utterance are dropped.
    """
    # BUG FIX: rename on a copy — the previous 'inplace=True' rename mutated
    # the caller's DataFrame as a side effect.
    table = table.rename(columns={'transcript': 'utterance', 'uttID': 'seg'})
    table = table.replace('', np.nan).dropna(subset=['speaker', 'utterance'], how='all')
    table.to_csv(path, index=False, float_format='%.3f')
177
-
178
def table_to_molly_xlsx(tbl: pd.DataFrame, path: str):
    """Write a table to the xlsx layout used by the contractor transcribers.

    Output columns: #, Timecode, Duration, Speaker, Dialogue, Annotations,
    Error Type; the sheet is named after the output file's stem.
    """
    # BUG FIX: 'tblx = tbl' merely aliased the caller's DataFrame, so the
    # inplace rename and added helper columns mutated the caller's object.
    tblx = tbl.copy()
    tblx.rename(columns={'uttID': '#', 'speaker': 'Speaker', 'transcript': 'Dialogue'}, inplace=True)
    tblx['dur_s'] = tblx['end_sec'] - tblx['start_sec']
    tblx['start_timecode'] = tblx['start_sec'].apply(sec_to_timecode)
    tblx['end_timecode'] = tblx['end_sec'].apply(sec_to_timecode)
    tblx['Duration'] = tblx['dur_s'].apply(sec_to_timecode)
    tblx['Timecode'] = [' - '.join(i) for i in zip(tblx['start_timecode'], tblx['end_timecode'])]
    tblx['Annotations'] = ''
    tblx['Error Type'] = ''
    tblx = tblx[['#', 'Timecode', 'Duration', 'Speaker', 'Dialogue', 'Annotations', 'Error Type']]
    tblx.to_excel(path, sheet_name=Path(path).stem, index=False)
190
-
191
def utt_labels_csv_to_table(label_csv: str):
    """Read an utt_labels csv (the usual diarized/timed transcript format in this repo).

    Several versions exist with different columns (with/without segment and/or
    utterance index); this normalizes them to a table shaped like
    [uttID, speaker, transcript, start_sec, end_sec].
    """
    table = pd.read_csv(label_csv, keep_default_na=False)
    cols = set(table.columns)
    # pick the uttID source column: prefer 'utt', then 'seg', else the row index
    if 'utt' in cols:
        # 'utt' wins; the redundant 'seg' column is discarded
        table = table.drop('seg', axis=1).rename(columns={'utt': 'uttID'})
    elif 'seg' in cols:
        table = table.rename(columns={'seg': 'uttID'})
    else:
        # no explicit id column: fall back to the positional index
        table = table.reset_index().rename(columns={'index': 'uttID'})
    return table
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
preprocessor_config.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "chunk_length": 30,
3
+ "feature_extractor_type": "WhisperFeatureExtractor",
4
+ "feature_size": 80,
5
+ "hop_length": 160,
6
+ "n_fft": 400,
7
+ "n_samples": 480000,
8
+ "nb_max_frames": 3000,
9
+ "padding_side": "right",
10
+ "padding_value": 0.0,
11
+ "processor_class": "WhisperProcessor",
12
+ "return_attention_mask": false,
13
+ "sampling_rate": 16000
14
+ }
renamers.py DELETED
@@ -1,77 +0,0 @@
1
- import csv
2
- import os
3
- import glob
4
- import shutil
5
- import re
6
-
7
-
8
# rename files from original filename (hexadecimal salad) to Session_ID (human readable) and back
# (removed a meaningless module-level 'global' statement: 'global' only has
# effect inside a function body)
DEFAULT_MAP_PATH = '../../SessionIDs_from_catalog.csv'


def make_SessionID_map(path=DEFAULT_MAP_PATH):
    """Generate the two lookup dicts between Session_ID and File_Name.

    The csv (copied from columns 1 & 2 of the Catalog on OneDrive) must have
    header columns ('File_Name' or 'Conference_ID') and 'Session_ID'.
    Extensions are stripped from filenames; rows where either field is blank
    are skipped.

    Returns:
        (SID_to_FN, FN_to_SID): dict pair mapping in each direction.
    """
    SID_to_FN = {}
    FN_to_SID = {}
    with open(path, encoding='utf-8-sig') as f:
        reader = csv.reader(f)
        headers = next(reader)
        # 'and' (not bitwise '&') is the idiomatic boolean connective here
        assert (headers[0] == 'File_Name' or headers[0] == 'Conference_ID') and (headers[1] == 'Session_ID'), \
            "Headers are wrong, expected ('File_Name' or 'Conference_ID') and 'Session_ID'"
        for line in reader:
            filename, sessionID = line
            filename = filename.split('.')[0]  # remove extensions
            if len(filename.strip()) > 0 and len(sessionID.strip()) > 0:
                SID_to_FN[sessionID] = filename
                FN_to_SID[filename] = sessionID
    return SID_to_FN, FN_to_SID
30
-
31
-
32
def rename_files_SID_to_FN(path, recursive=True, overwrite=False):
    """Replace Session_IDs in filenames under `path` with the original File_Name.

    For every Session_ID in the catalog map, matching files are copied to the
    renamed path (or moved in place when overwrite is True).

    Returns:
        list of new file paths created.
    """
    SID_to_FN, _ = make_SessionID_map()
    # TODO: deal with matching nested sIDs, see commented code below
    newpaths = []
    for sID, original_name in SID_to_FN.items():
        matches = glob.glob(os.path.join(path, '**', f'*{sID}.*'), recursive=recursive)
        for srcpath in matches:
            newpath = srcpath.replace(sID, original_name)
            print(newpath)
            if overwrite == True:
                shutil.move(srcpath, newpath)
            else:
                shutil.copy(srcpath, newpath)
            newpaths.append(newpath)
    return newpaths
49
-
50
-
51
- # # get sessnames
52
- # sesslist = [s for s in os.listdir(path) ]
53
- # srclist = [os.path.join(src_dir, filename) for filename in os.listdir(src_dir) if os.path.isfile(os.path.join(src_dir, filename))]
54
- # for src in srclist:
55
- # sessname_matches = [sessname in src for sessname in sesslist]
56
- # if sum(sessname_matches)>1:
57
- # print('!!!! multiple matches, will take longest match. TODO: implement this you dope')
58
- # elif not any(sessname_matches):
59
- # print(f'!!!! no sessname matches for file {src}')
60
- # else:
61
- # sessname = sesslist[sessname_matches.index(True)]
62
- # print(f'...copying to {sessname}')
63
- # shutil.copy(src, os.path.join(dest_dir,sessname))
64
-
65
def rename_files_FN_to_SID(path, recursive=True):
    # NOTE(review): unimplemented stub — it builds the FN->SID map and then
    # does nothing with it (no files are renamed; `path`/`recursive` unused).
    # Presumably meant to mirror rename_files_SID_to_FN in the other
    # direction — confirm intent before relying on it.
    _, FN_to_SID=make_SessionID_map()
67
-
68
def extract_conferenceID_from_filename(filename):
    """Extract the conference ID from a transcript filename.

    Takes the first space-separated token, then strips (in order) the trailing
    '.xlsx' part, 'TMcoded'/'Transcript' tags, any '_start<NN>_end<NN>_' trim
    annotation, and a leading '<5 digits>_<YYYY-MM-DD>_' session prefix.
    """
    conferenceID = filename.split(' ')[0]
    # BUG FIX: regex patterns are now raw strings — '\d' in a plain string
    # literal is an invalid escape sequence (DeprecationWarning, and a syntax
    # error in future Python versions).
    conferenceID = re.sub(r'_?[a-zA-Z]*(\.*[a-zA-Z]*).xlsx', '', conferenceID)
    conferenceID = re.sub(r'TMcoded|Transcript', '', conferenceID)
    conferenceID = re.sub(r'_start\d+_end\d+_?', '', conferenceID)
    conferenceID = re.sub(
        r'\d{5}_\d{4}-\d{2}-\d{2}_', '', conferenceID)
    return conferenceID
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
trimmers.py DELETED
@@ -1,139 +0,0 @@
1
- from pathlib import Path
2
- import os
3
- import csv
4
- import subprocess
5
- import pandas as pd
6
- import sys
7
- sys.path.append('..')
8
-
9
- from levi.converters import HHMMSS_to_sec
10
-
11
def trim_media(media_in,
               media_out,
               start,
               end):
    """Trim media_in to [start, end] and write media_out using ffmpeg.

    start/end may be 'HH:MM:SS'-style strings (converted via HHMMSS_to_sec)
    or numeric seconds. If media_out has a .wav extension the audio is
    re-encoded to mono 16 kHz pcm_s16le (the standard format for audio
    models); otherwise streams are copied without re-encoding.
    """
    # options for writing out audio if converting
    WAV_CHANNELS = 1
    WAV_SAMPLE_RATE = 16000

    ext = Path(media_out).suffix

    # accept either timestamp strings or numeric seconds
    start_sec = HHMMSS_to_sec(start) if isinstance(start, str) else float(start)
    end_sec = HHMMSS_to_sec(end) if isinstance(end, str) else float(end)

    if ext == '.wav':
        # convert to wav with standard format for audio models
        print(f'...Using ffmpeg to trim video from {start} to {end} \n and convert to {WAV_SAMPLE_RATE}Hz WAV with {WAV_CHANNELS} channels...')
        print(f'...generating {media_out}...')

        subprocess.call(['ffmpeg',
                         '-y',
                         '-i', media_in,
                         '-ss', f'{start_sec}',
                         '-to', f'{end_sec}',
                         '-acodec', 'pcm_s16le',
                         # BUG FIX: subprocess argument lists must contain
                         # strings; the bare ints raised TypeError
                         '-ac', str(WAV_CHANNELS),
                         '-ar', str(WAV_SAMPLE_RATE),
                         media_out,
                         '-hide_banner',
                         '-loglevel', 'warning'
                         ], shell=False)
    else:
        print(f'...Using ffmpeg to trim video from {start_sec} to {end_sec}...')
        print(f'...generating {media_out}...')

        subprocess.call(['ffmpeg',
                         '-y',
                         '-i', media_in,
                         '-ss', f'{start_sec}',
                         '-to', f'{end_sec}',
                         '-c', 'copy',
                         media_out,
                         '-hide_banner',
                         '-loglevel', 'warning'
                         ], shell=False)
77
-
78
def trim_media_batch(extract_timings_csv,
                     outpath,
                     suffix='',
                     convert_to=False):
    """trim a batch of media files given a csv of timings

    Args:
        extract_timings_csv (str): path to csv with columns:
            filepath, start (HH:MM:SS), end (HH:MM:SS)
        outpath (str): output path
        suffix (str, optional): save output trimmed files with this suffix. Defaults to ''.
        convert_to (bool, optional): [None, 'wav','mp4']. Defaults to False.
    Returns:
        outfiles (list): list of file paths created
    """
    os.makedirs(outpath, exist_ok=True)

    timings = pd.read_csv(
        extract_timings_csv,
        skip_blank_lines=True,
        index_col=False,
        names=['media_in', 'startHMS', 'endHMS'],
        header=0,
    ).dropna().sort_values(by='media_in', ignore_index=True).reset_index(drop=True)

    print(f'TRIMMING {len(timings.index)} FILES...')

    # enumerate samples per source file to detect repeated sessions
    timings['count'] = timings.groupby('media_in').cumcount()
    if not os.path.exists(outpath):
        os.makedirs(outpath)

    # output extension per convert_to flag; anything else keeps the input's
    extension_for = {'wav': '.wav', 'mp4': '.mp4'}

    outfiles = []
    for _, rec in timings.iterrows():
        media_in, startHMS, endHMS, count = rec.values
        # if multiple samples per recording, give a different name
        suffix_use = f'{suffix}{count}' if count > 0 else suffix

        if not os.path.exists(media_in):
            print(f'!!!WARNING: media not found: {media_in}')
            continue

        sessname = Path(media_in).stem
        print(f'...Input media: {media_in}')

        ext = extension_for.get(convert_to, Path(media_in).suffix)
        outfile = os.path.expanduser(os.path.join(outpath, f'{sessname}{suffix_use}{ext}'))

        trim_media(media_in, outfile, HHMMSS_to_sec(startHMS), HHMMSS_to_sec(endHMS))
        outfiles.append(outfile)
    return outfiles