"""Load a fine-tuned Whisper checkpoint for speech-to-text.

Everything in this module runs at import time: the Hugging Face cache
directory is configured, the compute device is chosen, and the tokenizer,
feature extractor, config, processor and model for ``model_name`` are loaded.
"""
import os

# The cache directory is derived from the current working directory (two
# levels up, then ``models_cache``). It is set before ``transformers`` is
# imported below -- presumably so the library picks it up at import time;
# keep this ordering.
abs_path = os.path.abspath('.')
base_dir = os.path.dirname(os.path.dirname(abs_path))
os.environ['TRANSFORMERS_CACHE'] = os.path.join(base_dir, 'models_cache')

import torch

# Details: (reference link missing in source)
# Turn on cuDNN benchmark mode and allow TF32 matmul on the CUDA backend.
torch.backends.cudnn.benchmark = True
torch.backends.cuda.matmul.allow_tf32 = True

from transformers import (
    pipeline,
    AutoTokenizer,
    AutoFeatureExtractor,
    AutoConfig,
    WhisperProcessor,
    WhisperForConditionalGeneration,
    WhisperTokenizer,
    WhisperFeatureExtractor,
)
from typing import Union, BinaryIO
# from optimum.bettertransformer import BetterTransformer

# Language token and task handed to the pipeline's generation settings.
language = '<|bn|>'
# language = '<|en|>'
task = "transcribe"  # transcribe or translate

# ---------------------------------------------------------------------------
# Checkpoint selection: exactly one `model_name` assignment is active.
# ---------------------------------------------------------------------------
# model_name = 'openai/whisper-tiny.en'
# model_name = 'openai/whisper-base.en'
# model_name = 'openai/whisper-small.en'
# model_name = 'openai/whisper-medium'
## v2: trained on more epochs with regularization
# model_name = 'openai/whisper-large-v2'
## bangla
# model_name = 'Rakib/whisper-tiny-bn'
# model_name = 'anuragshas/whisper-small-bn'
# model_name = 'anuragshas/whisper-large-v2-bn'
# model_name = "Rakib/whisper-small-bn"
# model_name = "Rakib/whisper-small-bn-all"
# model_name = "Rakib/whisper-small-bn-all-600"
# model_name = "Rakib/whisper-small-bn-all-600-v2"
model_name = "Rakib/whisper-small-bn-crblp"

## lets you know the device count: cuda:0 or cuda:1
# print(torch.cuda.device_count())

# Pipeline device index: 0 when CUDA is available, otherwise -1.
if torch.cuda.is_available():
    device = 0
else:
    device = -1
# device = -1  # Exclusively CPU

print(f"Using device: {'GPU' if device==0 else 'CPU'}")
if device != 0:
    print("[Warning!] Using CPU could hamper performance")

print("Loading Tokenizer for ASR Speech-to-Text Model...\n" + "*" * 100)
# tokenizer = AutoTokenizer.from_pretrained(model_name, language=language, task=task)
# tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
tokenizer = WhisperTokenizer.from_pretrained(model_name)
# tokenizer(['�', '�্র'],add_prefix_space=True, add_special_tokens=False).input_ids

print("Loading Feature Extractor for ASR Speech-to-Text Model...\n" + "*" * 100)
# feature_extractor = AutoFeatureExtractor.from_pretrained(model_name)
feature_extractor = WhisperFeatureExtractor.from_pretrained(model_name)

print("Loading Config for ASR Speech-to-Text Model...\n" + "*" * 100)
config = AutoConfig.from_pretrained(model_name)

print("Loading Processor for ASR Speech-to-Text Model...\n" + "*" * 100)
# The processor wraps the tokenizer and feature extractor loaded above.
processor = WhisperProcessor(
    feature_extractor=feature_extractor,
    tokenizer=tokenizer,
)

print("Loading WHISPER ASR Speech-to-Text Model...\n" + "*" * 100)
model = WhisperForConditionalGeneration.from_pretrained(model_name)

## BetterTransformer (No Need if PyTorch 2.0 works!!)
## (currently 2secs faster inference than PyTorch 2.0 )
# model = WhisperForConditionalGeneration.from_pretrained(model_name)
# model = BetterTransformer.transform(model)

## bitsandbytes (only Linux & GPU) (requires conda env with conda-based pytorch!!!)
## currently only reduces size. slower inference than native models!!!
## from_pretrained doc: (reference link missing in source)
# model = WhisperForConditionalGeneration.from_pretrained(model_name, device_map="auto", load_in_8bit=True)

## For PyTorch 2.0 (Only Linux)
# model = WhisperForConditionalGeneration.from_pretrained(model_name).to(device="cuda:0")
## mode options are "default", "reduce-overhead" and "max-autotune".
## See: (reference link missing in source)
# model = torch.compile(model, mode="default")

# Generation settings handed to the pipeline through `generate_kwargs`.
_GENERATE_KWARGS = {
    'language': language,
    'task': task,
    'repetition_penalty': 1.8,
    'num_beams': 2,
    'max_new_tokens': 448,
    'early_stopping': True,
    # 'renormalize_logits': True,
    # [16867]: �, [16867, 156, 100, 235, 156, 12811]: �্র
    'bad_words_ids': [[16867], [16867, 156, 100, 235, 156, 12811]],
    # 'supress_tokens': [16867, 156, 100, 235, 156, 12811],
}

# Module-level speech-recognition pipeline built from the objects loaded above.
asr = pipeline(
    task="automatic-speech-recognition",
    model=model,
    tokenizer=tokenizer,
    feature_extractor=feature_extractor,
    # processor=processor,  # no effect, see: (reference link missing in source)
    # config=config,        # no effect, see: (reference link missing in source)
    device=device,  # 0 for the GPU, -1 for the CPU (see where `device` is set)
    ## chunk files longer than 30s into shorter samples
    chunk_length_s=30,
    ## the amount of overlap (in secs) to be discarded while stitching the
    ## inferenced chunks.
    ## stride_length_s is a tuple of the left and right stride (overlap) length.
    ## With only 1 number, both sides get the same stride, by default.
    ## The stride_length on one side is 1/6th of the chunk_length_s if
    ## stride_length is not provided.
    # stride_length_s=[8, 8],
    stride_length_s=[5, 5],
    # stride_length_s=[6,0],
    batch_size=16,
    ignore_warning=True,
    ## force whisper to generate timestamps so that the chunking and stitching
    ## can be accurate
    # return_timestamps=True,
    generate_kwargs=_GENERATE_KWARGS,
)


def transcribe(speech_array: Union[str, BinaryIO], language: str = "en") -> str:
    """Transcribe one audio input with the module-level ``asr`` pipeline.

    Before the pipeline runs, the decoder prompt ids for ``language`` and the
    module-level ``task`` are stored on ``asr.model.config.forced_decoder_ids``.

    Args:
        speech_array: Audio input passed unchanged to ``asr``; annotated as a
            path string or a binary file object.
        language: Language handed to ``processor.get_decoder_prompt_ids``.
            Inside this function it shadows the module-level ``language``.

    Returns:
        The ``"text"`` entry of the pipeline result, converted to ``str``.
    """
    # NOTE(review): the pipeline was also built with a 'language' entry in its
    # generate_kwargs (the module-level language token). Confirm which of the
    # two settings takes effect with the installed transformers version.
    prompt_ids = processor.get_decoder_prompt_ids(language=language, task=task)
    asr.model.config.forced_decoder_ids = prompt_ids
    # asr.model.config.max_new_tokens = 448  # default is 448
    output = asr(speech_array)
    return str(output["text"])