---
language: ko
tags:
- whisper
- speech-recognition
datasets:
- maxseats/aihub-464-preprocessed-680GB-set-1
metrics:
- cer
---
# Model Name : maxseats/SungBeom-whisper-small-ko-set0
# Description
- 파인튜닝 데이터셋 : maxseats/aihub-464-preprocessed-680GB-set-1
# 설명
- AI hub의 주요 영역별 회의 음성 데이터셋을 학습 중이에요.
- 680GB 중 첫번째 데이터(10GB)를 파인튜닝한 모델을 불러와서, 두번째 데이터를 학습한 모델입니다.
- 링크 : https://huggingface.co/datasets/maxseats/aihub-464-preprocessed-680GB-set-0, https://huggingface.co/datasets/maxseats/aihub-464-preprocessed-680GB-set-1
다음 코드를 통해 생성했어요.
from datasets import load_dataset
import torch
from dataclasses import dataclass
from typing import Any, Dict, List, Union
import evaluate
from transformers import WhisperTokenizer, WhisperFeatureExtractor, WhisperProcessor, WhisperForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer
import mlflow
from mlflow.tracking.client import MlflowClient
import subprocess
from huggingface_hub import create_repo, Repository
import os
import shutil
import math  # temporary, for testing
# Local scratch directory the trained model is written to before upload.
model_dir = "./tmpp"  # Do not modify.

#########################################################################################################################################
################################################### User configuration variables ########################################################
#########################################################################################################################################

# Free-form description of this fine-tuning run. It is passed to the MLflow
# run / model version and embedded verbatim in the generated README.md, so it
# must be valid UTF-8 text with each bullet on a single line.
model_description = """
- 파인튜닝 데이터셋 : maxseats/aihub-464-preprocessed-680GB-set-1
# 설명
- AI hub의 주요 영역별 회의 음성 데이터셋을 학습 중이에요.
- 680GB 중 첫번째 데이터(10GB)를 파인튜닝한 모델을 불러와서, 두번째 데이터를 학습한 모델입니다.
- 링크 : https://huggingface.co/datasets/maxseats/aihub-464-preprocessed-680GB-set-0, https://huggingface.co/datasets/maxseats/aihub-464-preprocessed-680GB-set-1
"""

# Checkpoint to fine-tune.
# model_name = "openai/whisper-base"
model_name = "maxseats/SungBeom-whisper-small-ko-set0"  # Alternative: "SungBeom/whisper-small-ko"

# Dataset to load (Hugging Face Hub id).
dataset_name = "maxseats/aihub-464-preprocessed-680GB-set-1"

# Directory used as the datasets cache.
CACHE_DIR = '/mnt/a/maxseats/.finetuning_cache'

# True: quick test on a small amount of sample data, False: real fine-tuning.
is_test = False

# Hugging Face access token. Fill in before running; never commit a real token.
token = "hf_"
# Hyper-parameters and bookkeeping options handed to Seq2SeqTrainer.
training_args = Seq2SeqTrainingArguments(
    # --- Output and reporting ---
    output_dir=model_dir,  # Enter the desired repository name.
    push_to_hub=True,
    report_to=["tensorboard"],
    logging_steps=25,
    # --- Optimisation ---
    learning_rate=1e-5,
    warmup_steps=500,
    per_device_train_batch_size=16,
    gradient_accumulation_steps=2,  # Double this each time the batch size is halved.
    gradient_checkpointing=True,
    fp16=True,
    # Set exactly one of num_train_epochs / max_steps.
    num_train_epochs=1,  # Number of epochs.
    # max_steps=2,  # Use instead of epochs.
    # --- Evaluation ---
    evaluation_strategy="steps",
    eval_steps=1000,
    per_device_eval_batch_size=16,
    predict_with_generate=True,
    generation_max_length=225,
    # --- Checkpointing / model selection ---
    save_steps=1000,
    save_total_limit=5,  # Maximum number of checkpoints kept on disk.
    load_best_model_at_end=True,
    metric_for_best_model="cer",  # For Korean, 'cer' is likely a better fit than 'wer'.
    greater_is_better=False,
)

#########################################################################################################################################
################################################### User configuration variables ########################################################
#########################################################################################################################################
@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """Collate pre-processed speech examples into a single padded batch.

    Each feature is a mapping with an ``"input_features"`` entry (audio
    features) and a ``"labels"`` entry (token ids). The two are padded
    separately, through ``processor.feature_extractor.pad`` and
    ``processor.tokenizer.pad`` respectively.

    Attributes are documented inline below.
    """

    # Object exposing ``feature_extractor.pad`` and ``tokenizer.pad``.
    processor: Any
    # Token id to strip when every label sequence starts with it. ``None``
    # (the default) keeps the historical behaviour of comparing against
    # ``processor.tokenizer.bos_token_id``.
    # NOTE(review): the Hugging Face Whisper fine-tuning guide compares
    # against ``model.config.decoder_start_token_id`` here - confirm which id
    # the pre-processed labels actually start with and pass it explicitly.
    decoder_start_token_id: Union[int, None] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Return the padded batch with a ``"labels"`` tensor added.

        Args:
            features: Examples holding ``"input_features"`` and ``"labels"``.

        Returns:
            The mapping produced by ``feature_extractor.pad`` with the padded,
            masked label tensor stored under ``"labels"``.
        """
        # Inputs and labels differ in length and need different padding, so
        # they are split apart and padded independently.
        audio_inputs = [{"input_features": feature["input_features"]} for feature in features]
        batch = self.processor.feature_extractor.pad(audio_inputs, return_tensors="pt")

        # Pad the tokenized label sequences to the longest one in the batch.
        label_inputs = [{"input_ids": feature["labels"]} for feature in features]
        labels_batch = self.processor.tokenizer.pad(label_inputs, return_tensors="pt")

        # Padding positions become -100 so the loss computation ignores them.
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        start_id = self.decoder_start_token_id
        if start_id is None:
            start_id = self.processor.tokenizer.bos_token_id

        # If every sequence already begins with the start token, drop it; it
        # can be added back later. The width check avoids indexing column 0
        # of a zero-width label tensor.
        if (
            start_id is not None
            and labels.shape[1] > 0
            and (labels[:, 0] == start_id).all().cpu().item()
        ):
            labels = labels[:, 1:]

        batch["labels"] = labels
        return batch
def compute_metrics(pred):
    """Compute the character error rate (in percent) for an evaluation pass.

    Args:
        pred: Object exposing ``predictions`` (generated token ids) and
            ``label_ids`` (reference token ids, with -100 at ignored
            positions).

    Returns:
        ``{"cer": value}`` where ``value`` is 100 times the result of the
        module-level ``metric``.
    """
    pred_ids = pred.predictions
    # Work on a copy so the caller's label array keeps its -100 markers.
    # assumes label_ids is a NumPy array (has .copy()) - TODO confirm
    label_ids = pred.label_ids.copy()

    # Replace -100 with the pad token id so the sequences can be decoded.
    label_ids[label_ids == -100] = tokenizer.pad_token_id

    # Special tokens are skipped so they do not count towards the metric.
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    cer = 100 * metric.compute(predictions=pred_str, references=label_str)
    return {"cer": cer}
# Start from empty working directories: anything left in model_dir or ./repo
# is removed, then each directory is created again.
for _workdir in (model_dir, './repo'):
    if os.path.exists(_workdir):
        shutil.rmtree(_workdir)
    os.makedirs(_workdir)
# Load the model to fine-tune, then its feature extractor, tokenizer and
# processor, all from the same checkpoint.
model = WhisperForConditionalGeneration.from_pretrained(model_name)
# Clear the decoder settings stored in the checkpoint config.
# presumably so generation is not constrained by them - verify against the
# transformers version in use
model.config.forced_decoder_ids = None
model.config.suppress_tokens = []

feature_extractor = WhisperFeatureExtractor.from_pretrained(model_name)
tokenizer = WhisperTokenizer.from_pretrained(model_name, language="Korean", task="transcribe")
processor = WhisperProcessor.from_pretrained(model_name, language="Korean", task="transcribe")

# Batch collation and the evaluation metric used by compute_metrics.
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)
metric = evaluate.load('cer')
# Load the fully pre-processed dataset from the Hub (this takes a long time).
preprocessed_dataset = load_dataset(dataset_name, cache_dir=CACHE_DIR)

# Test mode: keep only the first 30% of the "valid" split so the whole
# pipeline can be exercised quickly.
if is_test:
    valid_split = preprocessed_dataset["valid"]
    # The size must come from the split itself: len() of the enclosing
    # dataset dict is the number of splits, not the number of rows.
    subset_size = math.ceil(len(valid_split) * 0.3)
    preprocessed_dataset["valid"] = valid_split.select(range(subset_size))
# Plain-dict view of the training arguments, logged below as MLflow params.
training_args_dict = training_args.to_dict()

# Location of the MLflow tracking database.
mlflow.set_tracking_uri("sqlite:////content/drive/MyDrive/STT_test/mlflow.db")

# The MLflow experiment is named after the model; reuse it when it exists.
experiment_name = model_name
existing_experiment = mlflow.get_experiment_by_name(experiment_name)
experiment_id = (
    mlflow.create_experiment(experiment_name)
    if existing_experiment is None
    else existing_experiment.experiment_id
)

# Model version to log; replaced below by the version MLflow assigns.
model_version = 1

# Everything from parameter logging to model registration happens in one run.
with mlflow.start_run(experiment_id=experiment_id, description=model_description):

    # Log every training argument, then tag the run with the dataset.
    for param_name, param_value in training_args_dict.items():
        mlflow.log_param(param_name, param_value)
    mlflow.set_tag("Dataset", dataset_name)

    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        train_dataset=preprocessed_dataset["train"],
        eval_dataset=preprocessed_dataset["valid"],  # or "test"
        tokenizer=processor.feature_extractor,
    )

    trainer.train()
    trainer.save_model(model_dir)  # Save the model after training.

    # Log the final evaluation metrics.
    metrics = trainer.evaluate()
    for eval_name, eval_value in metrics.items():
        mlflow.log_metric(eval_name, eval_value)

    # Register the model in MLflow; '/' in the model name is replaced by '-'.
    model_uri = f"runs:/{mlflow.active_run().info.run_id}/{model_dir}"
    registered_name = model_name.replace('/', '-')
    model_details = mlflow.register_model(model_uri=model_uri, name=registered_name)

    # Attach the description to the registered model version.
    client = MlflowClient()
    client.update_model_version(
        name=model_details.name,
        version=model_details.version,
        description=model_description,
    )

    # The assigned version is reused when naming the Hugging Face repository.
    model_version = model_details.version
## Hugging Face login
# Keep trying until `huggingface-cli login` succeeds, prompting for a new
# token after each failure. Entering "exit" at the prompt stops the loop.
while token != "exit":
    try:
        result = subprocess.run(["huggingface-cli", "login", "--token", token])
    except OSError:
        # The CLI could not be started (e.g. it is not installed); treat this
        # like a failed login and ask again.
        login_succeeded = False
    else:
        login_succeeded = result.returncode == 0

    if login_succeeded:
        break
    token = input("Please enter your Hugging Face API token: ")

# Export whatever token the loop ended with.
os.environ["HUGGINGFACE_HUB_TOKEN"] = token
# Hugging Face repository name: the model name with '/' replaced by '-',
# followed by the model version.
repo_name = f"maxseats/{model_name.replace('/', '-')}-{model_version}"

# Create the repository (no error if it already exists) and clone it.
create_repo(repo_name, exist_ok=True, token=token)
repo = Repository(local_dir='./repo', clone_from=repo_name, use_auth_token=token)

# Copy the files of model_dir into the clone. Directories at or below
# max_depth are visited by the walk but their files are not copied.
max_depth = 1  # Maximum depth to copy from.
base_depth = model_dir.count(os.sep)
for root, dirs, files in os.walk(model_dir):
    if root.count(os.sep) - base_depth >= max_depth:
        continue
    for file_name in files:
        shutil.copy(os.path.join(root, file_name), './repo')

# Save the tokenizer files next to the model files.
tokenizer.save_pretrained('./repo')
# Model card: YAML metadata followed by the model name and description.
readme = f"""
---
language: ko
tags:
- whisper
- speech-recognition
datasets:
- {dataset_name}
metrics:
- cer
---
# Model Name : {model_name}
# Description
{model_description}
"""

# Write the model card. The encoding is explicit because the description is
# not ASCII and the platform default encoding may be unable to represent it.
with open("./repo/README.md", "w", encoding="utf-8") as f:
    f.write(readme)
# Commit and push the prepared files.
repo.push_to_hub(commit_message="Initial commit")

# Remove both working directories and everything inside them.
for _leftover in (model_dir, './repo'):
    shutil.rmtree(_leftover)