import os

abs_path = os.path.abspath('.')
base_dir = os.path.dirname(os.path.dirname(abs_path))
os.environ['TRANSFORMERS_CACHE'] = os.path.join(base_dir, 'models_cache')
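## os.path.abspath('.') is the current working directory, so the cache lands two levels
## above wherever the script is launched from. Optional sketch, assuming the directory
## should exist before the first model download:
# os.makedirs(os.environ['TRANSFORMERS_CACHE'], exist_ok=True)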

import torch
# Details: https://huggingface.co/docs/diffusers/optimization/fp16#enable-cudnn-autotuner
torch.backends.cudnn.benchmark = True
torch.backends.cuda.matmul.allow_tf32 = True
from transformers import pipeline, AutoTokenizer, AutoFeatureExtractor, AutoConfig, WhisperProcessor, WhisperForConditionalGeneration, WhisperTokenizer, WhisperFeatureExtractor
from typing import Union, BinaryIO
# from optimum.bettertransformer import BetterTransformer

language = '<|bn|>'
# language = '<|en|>'
task = "transcribe"  # transcribe or translate

# model_name = 'openai/whisper-tiny.en'
# model_name = 'openai/whisper-base.en'
# model_name = 'openai/whisper-small.en'
# model_name = 'openai/whisper-medium' 
## v2: trained on more epochs with regularization
# model_name = 'openai/whisper-large-v2' 

## bangla
# model_name = 'Rakib/whisper-tiny-bn' 
#model_name = 'anuragshas/whisper-small-bn' 
# model_name = 'anuragshas/whisper-large-v2-bn'
# model_name = "Rakib/whisper-small-bn"
# model_name = "Rakib/whisper-small-bn-all"
# model_name = "Rakib/whisper-small-bn-all-600"
# model_name = "Rakib/whisper-small-bn-all-600-v2"
model_name = "Rakib/whisper-small-bn-crblp"

## prints the number of visible CUDA devices (e.g. 2 if cuda:0 and cuda:1 are available)
# print(torch.cuda.device_count())

device = 0 if torch.cuda.is_available() else -1
# device = -1  # Exclusively CPU

print(f"Using device: {'GPU' if device == 0 else 'CPU'}")

if device != 0:
    print("[Warning!] Running on CPU may significantly slow down inference")

print("Loading Tokenizer for ASR Speech-to-Text Model...\n" + "*" * 100)
# tokenizer = AutoTokenizer.from_pretrained(model_name, language=language, task=task)
# tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
tokenizer = WhisperTokenizer.from_pretrained(model_name)
# tokenizer(['�', '�্র'],add_prefix_space=True, add_special_tokens=False).input_ids

print("Loading Feature Extractor for ASR Speech-to-Text Model...\n" + "*" * 100)
# feature_extractor = AutoFeatureExtractor.from_pretrained(model_name)
feature_extractor = WhisperFeatureExtractor.from_pretrained(model_name)

print("Loading Config for ASR Speech-to-Text Model...\n" + "*" * 100)
config = AutoConfig.from_pretrained(model_name)

print("Loading Processor for ASR Speech-to-Text Model...\n" + "*" * 100)
processor = WhisperProcessor(feature_extractor=feature_extractor, tokenizer=tokenizer)

print("Loading WHISPER ASR Speech-to-Text Model...\n" + "*" * 100)
model = WhisperForConditionalGeneration.from_pretrained(model_name)

## BetterTransformer (No Need if PyTorch 2.0 works!!) 
## (currently ~2 s faster inference than PyTorch 2.0)
# model = WhisperForConditionalGeneration.from_pretrained(model_name)
# model = BetterTransformer.transform(model)

## bitsandbytes (only Linux & GPU) (requires conda env with conda-based pytorch!!!)
## currently only reduces model size; inference is slower than native models!!!
## from_pretrained doc: https://huggingface.co/docs/transformers/v4.25.1/en/main_classes/model#transformers.PreTrainedModel.from_pretrained
# model = WhisperForConditionalGeneration.from_pretrained(model_name, device_map="auto", load_in_8bit=True)

## For PyTorch 2.0 (Only Linux)
# model = WhisperForConditionalGeneration.from_pretrained(model_name).to(device="cuda:0")
## mode options are "default", "reduce-overhead" and "max-autotune". See: https://pytorch.org/get-started/pytorch-2.0/#modes
# model = torch.compile(model, mode="default") 


asr = pipeline(
    task="automatic-speech-recognition",
    model=model,
    tokenizer=tokenizer,
    feature_extractor=feature_extractor,
    # processor=processor, #no effect see: https://github.com/huggingface/transformers/blob/main/src/transformers/pipelines/automatic_speech_recognition.py
    # config=config, #no effect see: https://github.com/huggingface/transformers/blob/main/src/transformers/pipelines/automatic_speech_recognition.py
    device=device,  # 0 for the first GPU, -1 for CPU
    ## chunk files longer than 30s into shorter samples
    chunk_length_s=30,
    ## the amount of overlap (in secs) discarded while stitching the transcribed chunks.
    ## stride_length_s is a tuple of the left and right stride (overlap) lengths;
    ## with a single number, both sides get the same stride.
    ## If stride_length_s is not provided, it defaults to chunk_length_s / 6 per side.
    # stride_length_s=[8, 8],
    stride_length_s=[5, 5],
    # stride_length_s=[6,0],
    batch_size=16,
    ignore_warning=True,
    ## force whisper to generate timestamps so that the chunking and stitching can be accurate
    # return_timestamps=True, 
    generate_kwargs = {
                       'language':language, 
                       'task':task, 
                       'repetition_penalty':1.8,
                       'num_beams':2,
                       'max_new_tokens':448,
                       'early_stopping':True,
                    #    'renormalize_logits':True,
                       # [16867]: �, [16867, 156, 100, 235, 156, 12811]: �্র
                       'bad_words_ids':[[16867], [16867, 156, 100, 235, 156, 12811]],
                    #    'suppress_tokens': [16867, 156, 100, 235, 156, 12811],
                       }
)
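
## Quick sanity-check sketch, assuming a local file such as "sample_bn.wav" exists
## (hypothetical path); the pipeline accepts any audio format readable by ffmpeg and
## can return chunk-level timestamps when asked:
# out = asr("sample_bn.wav", return_timestamps=True)
# print(out["text"])
# print(out.get("chunks"))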


def transcribe(speech_array: Union[str, BinaryIO], language: str = "en") -> str:
    """
    Transcribes audio to text.
    Args:
        speech_array (Union[str, BinaryIO]): path to an audio file or a file-like object
        language (str): language code, e.g. "bn" for Bengali or "en" for English
    Returns:
        a string containing the transcription
    """
    asr.model.config.forced_decoder_ids = processor.get_decoder_prompt_ids(language=language, task=task)
    # asr.model.config.max_new_tokens = 448 #default is 448
    
    result = asr(speech_array)

    return str(result["text"])
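

## Minimal usage sketch, assuming this file is run directly; "audio/sample_bn.wav" is a
## hypothetical path and should be replaced with a real recording.
if __name__ == "__main__":
    sample_path = "audio/sample_bn.wav"
    if os.path.exists(sample_path):
        print(transcribe(sample_path, language="bn"))
    else:
        print(f"No sample file at {sample_path}; call transcribe() with your own audio path.")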