# Youtube-to-HF-Dataset / downloader / whisper_post_processor.py
# RamAnanth1's picture
# Update downloader/whisper_post_processor.py
# d297f8f
import os
import re

from datasets import Dataset
from yt_dlp.postprocessor import PostProcessor

from interpreter import WhisperInterpreter
from utils import VIDEO_INFO, json_dump
class WhisperPP(PostProcessor):
    """yt-dlp post-processor that runs a Whisper interpreter on each video.

    For every processed video, the metadata fields named in ``VIDEO_INFO`` are
    merged with the interpreter's output into a single record. Records are
    accumulated in ``self.data``, which is either a plain ``list`` or an
    object exposing ``add_item`` / ``num_rows`` / ``push_to_hub``
    (presumably a ``datasets.Dataset`` -- confirm against the caller).
    """

    # Captures the text between "datasets/" and "/resolve" in a URL.
    # Compiled once here instead of on every ``_get_name`` call.
    _REPO_ID_PATTERN = re.compile(r"(?<=datasets\/)(.*?)(?=\/resolve)")

    def __init__(self, data, name, **whisper_options):
        """Configure the post-processor.

        Args:
            data: Storage for the produced records (a list, or a dataset-like
                object; see the class docstring).
            name: Repository id passed to ``push_to_hub``.
            **whisper_options: Recognised keys are consumed here; everything
                left over is forwarded to the interpreter method on each call.

                * ``model_size`` -- passed to ``WhisperInterpreter``
                  (default ``"base"``).
                * ``mode`` -- name of the interpreter method to call
                  (default ``"transcribe"``).
                * ``write`` -- when truthy, dump each record to a JSON file
                  next to the video (default ``False``).
                * ``number_videos`` -- row count at which the dataset is
                  pushed; ``0`` disables pushing (default ``0``).
        """
        super().__init__()
        self._options = whisper_options
        interpreter = WhisperInterpreter(self._options.pop("model_size", "base"))
        self.data = data
        self._process = getattr(interpreter, self._options.pop("mode", "transcribe"))
        # Defaulted like the other options so that omitting it is not a KeyError.
        self._write = self._options.pop("write", False)
        self.videos_to_process = self._options.pop("number_videos", 0)
        self.repoId = name

    def run(self, info):
        """Process one video and store the resulting record.

        Args:
            info: Info dict for the video; must contain ``"id"``,
                ``"filepath"`` and every key listed in ``VIDEO_INFO``.

        Returns:
            The tuple ``([], info)``; per the yt-dlp post-processor
            convention the first element lists files to delete.
        """
        self.to_screen(f"Processing Video {info['id']}")
        result = {key: info[key] for key in VIDEO_INFO}
        result.update(self._process(info["filepath"], **self._options))
        self.to_screen(f"Processed Video {info['id']} and appended results.")
        self._update_data(result)
        if self._write:
            # splitext strips only the final extension, so dots elsewhere in
            # the path (e.g. "./out/my.video.mp4") no longer truncate the name.
            json_path = os.path.splitext(info["filepath"])[0] + ".json"
            json_dump(result, json_path)
        return [], info

    def _update_data(self, record):
        """Append ``record`` to the storage and push once the threshold is hit.

        A plain list is only appended to. A dataset-like object is replaced by
        the result of ``add_item`` and pushed to ``self.repoId`` whenever its
        row count has reached a non-zero ``videos_to_process``.
        """
        if isinstance(self.data, list):
            # A list has neither ``num_rows`` nor ``push_to_hub``.
            self.data.append(record)
            return

        self.data = self.data.add_item(record)
        threshold = self.videos_to_process
        if threshold != 0 and self.data.num_rows >= threshold:
            self.data.push_to_hub(self.repoId)

    def get_data(self):
        """Return the record storage in its current state."""
        return self.data

    def _get_name(self):
        """Derive a repository id from the first download-checksum URL.

        Returns:
            The text between ``datasets/`` and ``/resolve`` in the first key
            of ``self.data.info.download_checksums``, or ``""`` when the
            storage carries no checksums or the URL does not match.
        """
        dataset_info = getattr(self.data, "info", None)
        checksums = getattr(dataset_info, "download_checksums", None)
        if not checksums:
            # Covers list storage, a missing attribute, None and an empty dict.
            return ""
        first_url = next(iter(checksums))
        match = self._REPO_ID_PATTERN.search(first_url)
        return match.group(1) if match else ""