"""Transcribe the kotoba-whisper-eval samples with kotoba-whisper-v1.1.

For every record of the evaluation dataset the script runs the ASR pipeline,
pretty-prints the prediction and then blocks on ``input()`` until Enter is
pressed.
"""
from pprint import pprint

from datasets import load_dataset
from transformers.pipelines import pipeline

model_alias = "kotoba-tech/kotoba-whisper-v1.1"

# The record with this audio path is cut down to its first
# LONG_INTERVIEW_MAX_SAMPLES array elements before being transcribed.
# NOTE(review): the reason for this exact cut-off is not visible in this
# file -- confirm before changing it.
LONG_INTERVIEW_PATH = "long_interview_1.mp3"
LONG_INTERVIEW_MAX_SAMPLES = 7938000

# `punctuator` and `stable_ts` are not interpreted in this file; presumably
# they are consumed by the model's own pipeline code, which is why
# `trust_remote_code=True` is passed -- verify against the model card.
pipe = pipeline(
    model=model_alias,
    punctuator=True,
    stable_ts=True,
    chunk_length_s=15,
    batch_size=16,
    trust_remote_code=True,
)

dataset = load_dataset("kotoba-tech/kotoba-whisper-eval", split="train")

for sample in dataset:
    audio = sample["audio"]
    if audio["path"] == LONG_INTERVIEW_PATH:
        # Keep only the leading part of the long interview recording.
        audio["array"] = audio["array"][:LONG_INTERVIEW_MAX_SAMPLES]

    prediction = pipe(
        audio,
        return_timestamps=True,
        generate_kwargs={"language": "japanese", "task": "transcribe"},
    )
    pprint(prediction)

    # Pause until Enter is pressed so each result can be inspected.
    # NOTE(review): the original indentation was lost; the pause is assumed
    # to happen once per sample rather than once after the loop.
    input()