"""train_model.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1BMInZz4vjJ1PfgTbbqIknpJYcbM5cwV0
"""
|
|
import torch
import numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

# Canonical 4-way label mapping for the classification head; the reverse map
# and the language filter set are both derived from it so the three stay in sync.
label2id = {"en": 0, "fr": 1, "es": 2, "de": 3}
id2label = {idx: lang for lang, idx in label2id.items()}
target_langs = set(label2id)

print("Downloading dataset...")
dataset = load_dataset("papluca/language-identification", split="train")

# Keep only rows whose language code is one of the four we train on.
# NOTE(review): in this dataset the "labels" column holds ISO language-code
# strings (e.g. "en"), not integer ids — the membership test relies on that.
filtered_dataset = dataset.filter(lambda row: row["labels"] in target_langs)
|
|
# Multilingual DistilBERT checkpoint shared by the tokenizer and the model below.
model_ckpt = "distilbert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)


def preprocess(examples):
    """Tokenize a batch of texts and convert string language codes to int ids.

    Expects a batched mapping with "text" (list of strings) and "labels"
    (list of language-code strings); returns the tokenizer output with
    "labels" replaced by the integer ids the model trains on.
    """
    encoded = tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=64,
    )
    encoded["labels"] = list(map(label2id.__getitem__, examples["labels"]))
    return encoded
|
|
print("Preprocessing data...")

# Deterministic shuffle (fixed seed) then a 1500-row training subset,
# tokenized in batches via the preprocess() function above.
shuffled = filtered_dataset.shuffle(seed=42)
train_subset = shuffled.select(range(1500))
tokenized_data = train_subset.map(preprocess, batched=True)
|
|
# Fresh sequence-classification head (4 classes) on top of the same
# checkpoint the tokenizer was loaded from; label maps are stored in the
# model config so saved checkpoints decode predictions by name.
model = AutoModelForSequenceClassification.from_pretrained(
    model_ckpt, num_labels=4, id2label=id2label, label2id=label2id
)
|
|
# Training configuration: CPU-only, no intermediate checkpoints, 2 short epochs.
# BUG FIX: the original TrainingArguments(...) call was never closed (missing
# ")" before the Trainer construction), which is a guaranteed SyntaxError.
args = TrainingArguments(
    output_dir="my_real_model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    save_strategy="no",
    use_cpu=True,
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_data,
    tokenizer=tokenizer,
)
|
|
print("Starting training...")
# Runs the fine-tuning loop configured above (2 epochs on the 1500-row subset).
trainer.train()

print("Saving model to './production_model'...")
# Writes model weights, config (with label maps), and tokenizer files.
trainer.save_model("production_model")
print("Done!")