Maximofn committed on
Commit b004aea
1 Parent(s): 830ffb4

Change transcription to fp16, and check whether vocal separation was already done by another script: if it was, don't separate vocals again; if it wasn't, separate them.
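A caveat on the fp16 switch: the CTranslate2 backend that whisperx builds on generally supports float16 only on GPU, so hardcoding compute_type = "float16" can break CPU runs. A minimal device-aware fallback might look like the sketch below (the helper and its name are assumptions for illustration, not part of this commit):

def pick_precision(device):
    # Hypothetical helper, not in this commit: float16 generally needs a GPU,
    # so fall back to float32 when transcribing on CPU.
    if device == "cuda":
        return "float16", True
    return "float32", False

compute_type, fp16 = pick_precision(device)  # device comes from the CLI argument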

Files changed (1)
  1. transcribe.py +9 -8
transcribe.py CHANGED
@@ -20,15 +20,15 @@ for language_name, language_code in LANGUAGE_NAME_TO_CODE.items():
         "translator": language_code
     }
 
-def transcribe(audio_file, language, device):
+def transcribe(audio_file, language, device, vocals):
     output_folder = "transcriptions"
 
     # Transcribe audio file
     model = "large-v2"
     # word_timestamps = True
     print_progress = True
-    compute_type = "float32"
-    fp16 = False
+    compute_type = "float16"
+    fp16 = True
     batch_size = 8
     verbose = False
     min_speakers = 1
@@ -38,9 +38,9 @@ def transcribe(audio_file, language, device):
     hf_token = "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"
     command = f'whisperx {audio_file} --model {model} --batch_size {batch_size} --compute_type {compute_type} \
         --output_dir {output_folder} --output_format {output_format} --verbose {verbose} --language {language} \
-        --fp16 {fp16} --threads {threads} --print_progress {print_progress} --min_speakers {min_speakers} \
-        --max_speakers {max_speakers} --diarize --hf_token {hf_token}'
-    # --diarize'
+        --fp16 {fp16} --threads {threads} --print_progress {print_progress} --device {device}'
+    if vocals:
+        command += f' --diarize --max_speakers {max_speakers} --min_speakers {min_speakers} --hf_token {hf_token}'
     os.system(command)
 
 if __name__ == "__main__":
@@ -49,6 +49,7 @@ if __name__ == "__main__":
     parser.add_argument('language', help='Language of the audio file')
    parser.add_argument('speakers_file', help='File with the number of speakers')
     parser.add_argument('device', help='Device to use for PyTorch inference')
+    parser.add_argument('vocals', help='Vocals or not')
     args = parser.parse_args()
 
     vocals_folder = "vocals"
@@ -66,8 +67,8 @@
         extension = "wav"
         for i in range(speakers):
             file = f'{vocals_folder}/{input_name}_speaker{i:003d}.{extension}'
-            transcribe(file, language_dict[args.language]["transcriber"], args.device)
+            transcribe(file, language_dict[args.language]["transcriber"], args.device, args.vocals)
     else:
         extension = "mp3"
         file = f'{vocals_folder}/{input_name}.{extension}'
-        transcribe(file, language_dict[args.language]["transcriber"], args.device)
+        transcribe(file, language_dict[args.language]["transcriber"], args.device, args.vocals)
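
One thing to watch with the new vocals argument: argparse passes positional arguments through as strings, so "if vocals:" inside transcribe() is truthy even when the caller passes "False". A small converter would make the flag behave as a real boolean; the helper below is a sketch under that assumption, not something this commit adds:

import argparse

def str_to_bool(value):
    # Hypothetical helper, not in this commit: map common spellings to a bool
    value = value.strip().lower()
    if value in ("true", "1", "yes", "y"):
        return True
    if value in ("false", "0", "no", "n"):
        return False
    raise argparse.ArgumentTypeError(f"expected a boolean, got {value!r}")

Registered as parser.add_argument('vocals', type=str_to_bool, help='Vocals or not'), args.vocals would reach transcribe() as a genuine bool, so the --diarize branch only fires when actually requested.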