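"""Transcribe a list of audio files with whisperx, optionally running
speaker diarization on per-speaker vocal tracks produced by an earlier
separation step.

Assumed invocation (script and file names illustrative):
    python transcribe.py input_files.txt English speakers.txt cuda true
"""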
import os
import argparse
from lang_list import LANGUAGE_NAME_TO_CODE, WHISPER_LANGUAGES

# Pre-download the gated pyannote segmentation model used by whisperx diarization.
# The Hugging Face token is assumed to be provided via the HF_TOKEN environment
# variable rather than embedded in the source.
from pyannote.audio import Model
model = Model.from_pretrained("pyannote/segmentation-3.0", use_auth_token=os.environ.get("HF_TOKEN"))

language_dict = {}
# Iterate over the LANGUAGE_NAME_TO_CODE dictionary
for language_name, language_code in LANGUAGE_NAME_TO_CODE.items():
    # Extract the language code (the portion before the underscore)
    lang_code = language_code.split('_')[0].lower()
    
    # Check if the language code is present in WHISPER_LANGUAGES
    if lang_code in WHISPER_LANGUAGES:
        # Construct the entry for the resulting dictionary
        language_dict[language_name] = {
            "transcriber": lang_code,
            "translator": language_code
        }
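# A resulting entry might look like this (the exact codes depend on lang_list):
#   language_dict["English"] == {"transcriber": "en", "translator": "en_XX"}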

def transcribe(audio_file, language, device, vocals):
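    """Build and run a whisperx command for one audio file; when `vocals`
    is true, speaker diarization is enabled as well."""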
    output_folder = "transcriptions"

    # whisperx settings (mirrored by the CLI flags below)
    model_name = "large-v2"
    # word_timestamps = True
    print_progress = True
    compute_type = "float16"
    fp16 = True
    batch_size = 8
    verbose = False
    min_speakers = 1
    max_speakers = 10
    threads = 4
    output_format = "srt"
    # Token for the gated diarization models, assumed to be set in the environment.
    hf_token = os.environ.get("HF_TOKEN")
    command = f'whisperx {audio_file} --model {model_name} --batch_size {batch_size} --compute_type {compute_type} \
--output_dir {output_folder} --output_format {output_format} --verbose {verbose} --language {language} \
--fp16 {fp16} --threads {threads} --print_progress {print_progress} --device {device}'
    if vocals:
        command += f' --diarize --max_speakers {max_speakers} --min_speakers {min_speakers} --hf_token {hf_token}'
    os.system(command)
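
# For reference, the assembled command has this shape (values illustrative):
#   whisperx vocals/song.wav --model large-v2 --batch_size 8 --compute_type float16 \
#     --output_dir transcriptions --output_format srt --verbose False --language en \
#     --fp16 True --threads 4 --print_progress True --device cuda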

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Transcribe audio files')
    parser.add_argument('input_files', help='Text file listing one input audio path per line')
    parser.add_argument('language', help='Language of the audio files (a key of LANGUAGE_NAME_TO_CODE)')
    parser.add_argument('speakers_file', help='File whose first line is the number of separated speakers')
    parser.add_argument('device', help='Device to use for PyTorch inference (e.g. "cuda" or "cpu")')
    parser.add_argument('vocals', help='Whether separated vocal tracks should be diarized (true/false)')
    args = parser.parse_args()

    # argparse positionals arrive as strings; convert explicitly so values such
    # as "false" are not treated as truthy by `if vocals:` in transcribe().
    vocals = args.vocals.lower() in ('true', '1', 'yes')

    vocals_folder = "vocals"

    # The first line of the speakers file holds the number of separated speakers.
    with open(args.speakers_file, 'r') as f:
        speakers = int(f.readline().strip())

    with open(args.input_files, 'r') as f:
        inputs = f.read().splitlines()
    for input_path in inputs:
        # Bare file name without directory or extension, e.g. "audio/song.mp3" -> "song".
        input_name = os.path.splitext(os.path.basename(input_path))[0]
        if speakers > 0:
            extension = "wav"
            for i in range(speakers):
                file = f'{vocals_folder}/{input_name}_speaker{i:03d}.{extension}'
                transcribe(file, language_dict[args.language]["transcriber"], args.device, vocals)
        else:
            extension = "mp3"
            file = f'{vocals_folder}/{input_name}.{extension}'
            transcribe(file, language_dict[args.language]["transcriber"], args.device, vocals)