import os
abs_path = os.path.abspath('.')
base_dir = os.path.dirname(os.path.dirname(abs_path))
os.environ['TRANSFORMERS_CACHE'] = os.path.join(base_dir, 'models_cache')
import torch
# Details: https://huggingface.co/docs/diffusers/optimization/fp16#enable-cudnn-autotuner
torch.backends.cudnn.benchmark = True
torch.backends.cuda.matmul.allow_tf32 = True
from transformers import pipeline, AutoTokenizer, AutoFeatureExtractor, AutoConfig, WhisperProcessor, WhisperForConditionalGeneration, WhisperTokenizer, WhisperFeatureExtractor
from typing import Union, BinaryIO
# from optimum.bettertransformer import BetterTransformer
language = '<|bn|>'
# language = '<|en|>'
task = "transcribe"  # transcribe or translate
# model_name = 'openai/whisper-tiny.en'
# model_name = 'openai/whisper-base.en'
# model_name = 'openai/whisper-small.en'
# model_name = 'openai/whisper-medium'
## v2: trained for more epochs with regularization
# model_name = 'openai/whisper-large-v2'
## Bangla fine-tunes
# model_name = 'Rakib/whisper-tiny-bn'
# model_name = 'anuragshas/whisper-small-bn'
# model_name = 'anuragshas/whisper-large-v2-bn'
# model_name = "Rakib/whisper-small-bn"
# model_name = "Rakib/whisper-small-bn-all"
# model_name = "Rakib/whisper-small-bn-all-600"
# model_name = "Rakib/whisper-small-bn-all-600-v2"
model_name = "Rakib/whisper-small-bn-crblp"
## prints the number of CUDA devices available (e.g. cuda:0, cuda:1)
# print(torch.cuda.device_count())
device = 0 if torch.cuda.is_available() else -1
# device = -1  # Exclusively CPU
print(f"Using device: {'GPU' if device == 0 else 'CPU'}")
if device != 0:
    print("[Warning!] Running on CPU may slow down inference considerably")
print("Loading Tokenizer for ASR Speech-to-Text Model...\n" + "*" * 100) | |
# tokenizer = AutoTokenizer.from_pretrained(model_name, language=language, task=task) | |
# tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True) | |
tokenizer = WhisperTokenizer.from_pretrained(model_name) | |
# tokenizer(['�', '�্র'],add_prefix_space=True, add_special_tokens=False).input_ids | |
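## A hedged aside (not in the original flow): the nested ID lists passed to
## bad_words_ids in generate_kwargs below can be reproduced by encoding the
## offending characters without special tokens, along the lines of:
# ids = tokenizer(['�', '�্র'], add_prefix_space=True, add_special_tokens=False).input_ids
# print(ids)  # a list of token-ID lists, which is the shape bad_words_ids expects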
print("Loading Feature Extractor for ASR Speech-to-Text Model...\n" + "*" * 100) | |
# feature_extractor = AutoFeatureExtractor.from_pretrained(model_name) | |
feature_extractor = WhisperFeatureExtractor.from_pretrained(model_name) | |
print("Loading Config for ASR Speech-to-Text Model...\n" + "*" * 100) | |
config = AutoConfig.from_pretrained(model_name) | |
print("Loading Processor for ASR Speech-to-Text Model...\n" + "*" * 100) | |
processor = WhisperProcessor(feature_extractor=feature_extractor, tokenizer=tokenizer) | |
print("Loading WHISPER ASR Speech-to-Text Model...\n" + "*" * 100) | |
model = WhisperForConditionalGeneration.from_pretrained(model_name) | |
## BetterTransformer (no need if PyTorch 2.0 works)
## (currently ~2s faster inference than PyTorch 2.0)
# model = WhisperForConditionalGeneration.from_pretrained(model_name)
# model = BetterTransformer.transform(model)
## bitsandbytes (Linux & GPU only; requires a conda env with conda-based PyTorch)
## currently only reduces model size; inference is slower than the native model
## from_pretrained docs: https://huggingface.co/docs/transformers/v4.25.1/en/main_classes/model#transformers.PreTrainedModel.from_pretrained
# model = WhisperForConditionalGeneration.from_pretrained(model_name, device_map="auto", load_in_8bit=True)
## For PyTorch 2.0 (Linux only)
# model = WhisperForConditionalGeneration.from_pretrained(model_name).to(device="cuda:0")
## mode options are "default", "reduce-overhead" and "max-autotune". See: https://pytorch.org/get-started/pytorch-2.0/#modes
# model = torch.compile(model, mode="default")
asr = pipeline(
    task="automatic-speech-recognition",
    model=model,
    tokenizer=tokenizer,
    feature_extractor=feature_extractor,
    # processor=processor,  # no effect, see: https://github.com/huggingface/transformers/blob/main/src/transformers/pipelines/automatic_speech_recognition.py
    # config=config,  # no effect, see: https://github.com/huggingface/transformers/blob/main/src/transformers/pipelines/automatic_speech_recognition.py
    device=device,  # 0 for GPU, -1 for CPU
    ## split files longer than 30s into shorter chunks
    chunk_length_s=30,
    ## the amount of overlap (in seconds) discarded while stitching the transcribed chunks back together.
    ## stride_length_s is a tuple of the left and right stride (overlap) lengths;
    ## a single number applies the same stride to both sides.
    ## If stride_length_s is not provided, it defaults to chunk_length_s / 6
    ## per side (5s here, since chunk_length_s=30).
    # stride_length_s=[8, 8],
    stride_length_s=[5, 5],
    # stride_length_s=[6, 0],
    batch_size=16,
    ignore_warning=True,
    ## force Whisper to generate timestamps so that chunking and stitching can be accurate
    # return_timestamps=True,
    generate_kwargs={
        'language': language,
        'task': task,
        'repetition_penalty': 1.8,
        'num_beams': 2,
        'max_new_tokens': 448,
        'early_stopping': True,
        # 'renormalize_logits': True,
        # [16867]: �, [16867, 156, 100, 235, 156, 12811]: �্র
        'bad_words_ids': [[16867], [16867, 156, 100, 235, 156, 12811]],
        # 'suppress_tokens': [16867, 156, 100, 235, 156, 12811],
    },
)
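## A minimal hedged sketch of what the pipeline does under the hood for one
## short clip, using the processor and model loaded above. `waveform` is
## assumed to be a mono float32 NumPy array sampled at 16 kHz (Whisper's
## expected input rate); this helper is illustrative, not part of the
## original serving path.
# def transcribe_raw(waveform, sampling_rate=16000):
#     inputs = processor(waveform, sampling_rate=sampling_rate, return_tensors="pt")
#     input_features = inputs.input_features.to(model.device)
#     predicted_ids = model.generate(input_features, max_new_tokens=448)
#     return processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]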
def transcribe(speech_array: Union[str, BinaryIO], language: str = "bn") -> str:
    """
    Transcribes audio to text.
    Args:
        speech_array (Union[str, BinaryIO]): path to an audio file, or a file-like object
        language (str): language code, e.g. "bn" or "en"
    Returns:
        a string containing the transcription
    """
    asr.model.config.forced_decoder_ids = processor.get_decoder_prompt_ids(language=language, task=task)
    # asr.model.config.max_new_tokens = 448  # default is 448
    result = asr(speech_array)
    return str(result["text"])
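## Example usage (hedged): 'sample.wav' is a hypothetical path, not a file
## shipped with this Space. The ASR pipeline accepts a filename, raw bytes,
## or a dict with 'array' and 'sampling_rate' keys.
# if __name__ == "__main__":
#     print(transcribe("sample.wav", language="bn"))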