Maximofn committed on
Commit
1e1be2d
1 Parent(s): 7994131

transcribe with whisperx

Browse files
Files changed (1) hide show
  1. transcribe.py +44 -12
transcribe.py CHANGED
@@ -1,20 +1,46 @@
1
  import os
2
  import argparse
 
3
 
4
- def transcribe(audio_file, language):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  output_folder = "transcriptions"
6
 
7
  # Transcribe audio file
8
  model = "large-v2"
9
- word_timestamps = True
 
 
10
  fp16 = False
11
- device = "cuda"
12
  verbose = False
 
 
13
  threads = 4
14
  output_format = "srt"
15
- command = f'whisper --model {model} --output_dir {output_folder} --language {language} \
16
- --word_timestamps {word_timestamps} --fp16 {fp16} --device {device} --verbose {verbose} \
17
- --threads {threads} --output_format {output_format} {audio_file}'
 
 
 
18
  os.system(command)
19
 
20
  if __name__ == "__main__":
@@ -22,10 +48,10 @@ if __name__ == "__main__":
22
  parser.add_argument('input_files', help='Input audio files')
23
  parser.add_argument('language', help='Language of the audio file')
24
  parser.add_argument('speakers_file', help='File with the number of speakers')
 
25
  args = parser.parse_args()
26
 
27
  vocals_folder = "vocals"
28
- extension = "wav"
29
 
30
  with open(args.speakers_file, 'r') as f:
31
  speakers = f.read().splitlines()
@@ -34,8 +60,14 @@ if __name__ == "__main__":
34
  with open(args.input_files, 'r') as f:
35
  inputs = f.read().splitlines()
36
  for input in inputs:
37
- input, _ = input.split('.')
38
- _, input_name = input.split('/')
39
- for i in range(speakers):
40
- file = f'{vocals_folder}/{input_name}_speaker{i:003d}.{extension}'
41
- transcribe(file, args.language)
 
 
 
 
 
 
 
1
  import os
2
  import argparse
3
+ from lang_list import LANGUAGE_NAME_TO_CODE, WHISPER_LANGUAGES
4
 
5
# For pyannote.audio diarize
from pyannote.audio import Model

# The Hugging Face access token is read from the HF_TOKEN environment
# variable so that no credential is stored in source control. When the
# variable is unset, None is passed and the library's own default
# authentication lookup applies.
model = Model.from_pretrained(
    "pyannote/segmentation-3.0",
    use_auth_token=os.environ.get("HF_TOKEN"),
)
8
+
9
# Map every language name to the codes stored under "transcriber" and
# "translator".
language_dict = {}
for language_name, language_code in LANGUAGE_NAME_TO_CODE.items():
    # The transcriber code is the lowercased text before the first underscore.
    lang_code = language_code.partition('_')[0].lower()

    # Names whose code is not in WHISPER_LANGUAGES are left out of the mapping.
    if lang_code not in WHISPER_LANGUAGES:
        continue

    language_dict[language_name] = dict(
        transcriber=lang_code,
        translator=language_code,
    )
22
+
23
def transcribe(audio_file, language, device):
    """Transcribe and diarize one audio file by running the ``whisperx`` CLI.

    The result is written in SRT format to the ``transcriptions`` folder.

    Args:
        audio_file: Path of the audio file to transcribe.
        language: Language code forwarded to ``whisperx --language``.
        device: Device name forwarded to ``whisperx --device``.

    Returns:
        None. Failures of the external command are reported on stderr and
        do not raise, so a batch of files keeps going after one failure.

    The Hugging Face token used by ``--diarize`` is taken from the
    ``HF_TOKEN`` environment variable; it is never hard-coded.
    """
    import subprocess
    import sys

    output_folder = "transcriptions"

    # An argument list (no shell) keeps paths with spaces or shell
    # metacharacters intact and avoids command injection via file names.
    command = [
        "whisperx", str(audio_file),
        "--model", "large-v2",
        "--batch_size", "8",
        "--compute_type", "float32",
        "--output_dir", output_folder,
        "--output_format", "srt",
        "--verbose", "False",
        "--language", str(language),
        "--fp16", "False",
        "--threads", "4",
        "--print_progress", "True",
        "--min_speakers", "1",
        "--max_speakers", "10",
        # Forward the caller's device choice; it was previously ignored.
        "--device", str(device),
        "--diarize",
    ]

    hf_token = os.environ.get("HF_TOKEN")
    if hf_token:
        command += ["--hf_token", hf_token]
    else:
        print(
            "HF_TOKEN is not set; running whisperx without --hf_token",
            file=sys.stderr,
        )

    try:
        completed = subprocess.run(command, check=False)
    except FileNotFoundError:
        # os.system would only have returned a non-zero status here, so
        # report the problem instead of raising.
        print("whisperx executable not found on PATH", file=sys.stderr)
        return

    if completed.returncode != 0:
        print(
            f"whisperx exited with status {completed.returncode} for {audio_file}",
            file=sys.stderr,
        )
45
 
46
  if __name__ == "__main__":
 
48
  parser.add_argument('input_files', help='Input audio files')
49
  parser.add_argument('language', help='Language of the audio file')
50
  parser.add_argument('speakers_file', help='File with the number of speakers')
51
+ parser.add_argument('device', help='Device to use for PyTorch inference')
52
  args = parser.parse_args()
53
 
54
  vocals_folder = "vocals"
 
55
 
56
  with open(args.speakers_file, 'r') as f:
57
  speakers = f.read().splitlines()
 
60
  with open(args.input_files, 'r') as f:
61
  inputs = f.read().splitlines()
62
  for input in inputs:
63
+ input_file, _ = input.split('.')
64
+ _, input_name = input_file.split('/')
65
+ if speakers > 0:
66
+ extension = "wav"
67
+ for i in range(speakers):
68
+ file = f'{vocals_folder}/{input_name}_speaker{i:003d}.{extension}'
69
+ transcribe(file, language_dict[args.language]["transcriber"], args.device)
70
+ else:
71
+ extension = "mp3"
72
+ file = f'{vocals_folder}/{input_name}.{extension}'
73
+ transcribe(file, language_dict[args.language]["transcriber"], args.device)