Sabbah13 committed on
Commit
50e81bb
1 Parent(s): b688722

Create utils.py

Browse files
Files changed (1) hide show
  1. utils.py +73 -0
utils.py ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
def _fill_missing_word_fields(words, segment_speaker, segment_start, segment_end):
    """Fill absent 'speaker', 'start' and 'end' keys of each word dict in place.

    Speaker: taken from the previous word, else the next word, else the segment.
    Start:   taken from the previous word's 'end', else the segment start.
    End:     taken from the next word's 'start'; for the last word the segment
             end; otherwise the word's own start.
    """
    last_index = len(words) - 1
    for i, word_info in enumerate(words):
        previous_word = words[i - 1] if i > 0 else None
        next_word = words[i + 1] if i < last_index else None

        if 'speaker' not in word_info:
            if previous_word is not None and 'speaker' in previous_word:
                word_info['speaker'] = previous_word['speaker']
            elif next_word is not None and 'speaker' in next_word:
                word_info['speaker'] = next_word['speaker']
            else:
                word_info['speaker'] = segment_speaker

        if 'start' not in word_info:
            if previous_word is not None and 'end' in previous_word:
                word_info['start'] = previous_word['end']
            else:
                word_info['start'] = segment_start

        if 'end' not in word_info:
            if next_word is not None and 'start' in next_word:
                word_info['end'] = next_word['start']
            elif next_word is None:
                # Last word of the segment: close it at the segment boundary.
                word_info['end'] = segment_end
            else:
                # The next word has no start either; collapse to a zero-length span.
                word_info['end'] = word_info['start']


def _format_run(speaker, start, end, run_words):
    """Render one same-speaker run as a single output line."""
    text = ' '.join(run_words)
    if start is not None and end is not None:
        return f'{speaker} ({start} : {end}) : {text}'
    # Timing is omitted entirely when either bound is unknown.
    return f'{speaker} : {text}'


def convert_segments_object_to_text(data):
    """Convert a segments object into speaker-labelled transcript text.

    Within each segment, consecutive words that share a speaker are merged
    into one line of the form ``"<speaker> (<start> : <end>) : <words>"``;
    the timing part is dropped when the start or end is ``None``. Runs never
    span two segments.

    Args:
        data: Mapping with a ``'segments'`` list. Each segment must have a
            ``'words'`` list of dicts and may have ``'speaker'``, ``'start'``
            and ``'end'``. Each word dict may have ``'word'``, ``'speaker'``,
            ``'start'`` and ``'end'``.
            NOTE(review): this looks like diarized speech-to-text output;
            confirm the exact schema against the caller.

    Returns:
        The formatted lines joined with newlines; an empty string when there
        are no words at all.

    Raises:
        KeyError: If ``data`` has no ``'segments'`` key or a segment has no
            ``'words'`` key.

    Note:
        Missing ``'speaker'``/``'start'``/``'end'`` keys are filled in on the
        word dicts of ``data`` itself, so the input is modified in place.
    """
    result = []

    for segment in data['segments']:
        words = segment['words']
        _fill_missing_word_fields(
            words,
            segment.get('speaker'),
            segment.get('start'),
            segment.get('end'),
        )

        run_speaker = None
        run_start = None
        run_end = None
        run_words = []

        for word_info in words:
            word = word_info.get('word', '')
            speaker = word_info.get('speaker')
            start = word_info.get('start')
            end = word_info.get('end')

            # Speaker changed: flush the open run before starting a new one.
            if run_words and speaker != run_speaker:
                result.append(_format_run(run_speaker, run_start, run_end, run_words))
                run_words = []

            # "No open run" is detected via the empty buffer rather than a
            # None speaker, because None is a legitimate speaker value when
            # nothing supplies one; keying on it would reset the run start
            # on every such word.
            if not run_words:
                run_speaker = speaker
                run_start = start

            run_words.append(word)
            run_end = end

        # Flush whatever run is still open at the end of this segment.
        if run_words:
            result.append(_format_run(run_speaker, run_start, run_end, run_words))

    return '\n'.join(result)