import evaluate
import numpy as np
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
)

dataset_id = "google/fleurs"
model_id = "facebook/xlm-v-base"
metric_name = "accuracy"

# Keep only the raw transcription and the language id (which we'll use as label)
columns_to_remove = [
    "audio",
    "id",
    "num_samples",
    "path",
    "transcription",
    "gender",
    "language",
    "lang_group_id",
]

# Load the "all" configuration, which combines every FLEURS language;
# ignore_verifications skips checksum verification of the downloaded files
train, val = load_dataset(
    dataset_id,
    "all",
    split=["train", "validation"],
    ignore_verifications=True,
)

# Build the label2id and id2label dictionaries from the language names and ids
# (this must happen before the "language" column is removed below)
unique_langs = set()
label2id = {}
id2label = {}
for lang, lang_id in zip(val["language"], val["lang_id"]):
    if lang not in unique_langs:
        unique_langs.add(lang)
        id2label[lang_id] = lang
        label2id[lang] = lang_id

# Sort both mappings by the numeric language id for readability
id2label = dict(sorted(id2label.items(), key=lambda item: item[0]))
label2id = dict(sorted(label2id.items(), key=lambda item: item[1]))

# Drop everything except raw_transcription and lang_id, then rename them to
# the "text" and "label" columns the Trainer expects
train = train.remove_columns(columns_to_remove)
val = val.remove_columns(columns_to_remove)

train = train.rename_column("raw_transcription", "text")
val = val.rename_column("raw_transcription", "text")

train = train.rename_column("lang_id", "label")
val = val.rename_column("lang_id", "label")

train = train.shuffle(seed=42)
val = val.shuffle(seed=42)

tokenizer = AutoTokenizer.from_pretrained(model_id)


def preprocess(data):
    # Truncate to the model's maximum sequence length; padding is handled
    # dynamically per batch by the Trainer's default data collator
    return tokenizer(data["text"], truncation=True)


processed_train = train.map(preprocess, batched=True)
processed_val = val.map(preprocess, batched=True)

print(processed_train)
print(processed_val)

# Fine-tune the model: the sequence-classification head is newly initialized
# with one output per language
model = AutoModelForSequenceClassification.from_pretrained(
    model_id,
    num_labels=len(id2label),
    label2id=label2id,
    id2label=id2label,
    ignore_mismatched_sizes=True,
)

args = TrainingArguments(
    "xlm-v-base-language-id",
    learning_rate=3e-5,
    warmup_ratio=0.1,
    per_device_train_batch_size=16,
    gradient_accumulation_steps=4,  # effective train batch size of 64 per device
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_steps=10,
    fp16=True,
    push_to_hub=True,
)

metric = evaluate.load(metric_name)


def compute_metrics(eval_pred):
    # Argmax over the logits gives the predicted class id for each example
    predictions = np.argmax(eval_pred.predictions, axis=1)
    return metric.compute(predictions=predictions, references=eval_pred.label_ids)


trainer = Trainer(
    model,
    args,
    train_dataset=processed_train,
    eval_dataset=processed_val,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()
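
# A minimal inference sketch, assuming it runs after trainer.train() has
# finished: it reuses the in-memory model and tokenizer, so it does not depend
# on a saved checkpoint path. If you pushed to the Hub, you could instead load
# the pipeline by repo id (e.g. "<your-username>/xlm-v-base-language-id").
from transformers import pipeline

classifier = pipeline(
    "text-classification",
    model=model,  # the fine-tuned classifier from the Trainer run above
    tokenizer=tokenizer,
)

# The pipeline maps the predicted class id back to a language name via the
# id2label mapping stored in the model config, returning something like
# [{'label': 'Turkish', 'score': ...}]
print(classifier("Merhaba, nasılsın?"))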