import pandas as pd
import streamlit as st
import torch
from datasets import Dataset
from transformers import MT5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments
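# Load the pretrained mT5-base checkpoint; T5Tokenizer works with mT5's SentencePiece vocabulary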
tokenizer = T5Tokenizer.from_pretrained("google/mt5-base")
model = MT5ForConditionalGeneration.from_pretrained("google/mt5-base")
#st.write(model)
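# proverbs.csv is assumed to have 'Proverb' and 'Meaning' columns (inferred from the preprocessing below)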
df = pd.read_csv("proverbs.csv")
st.dataframe(df)  # render the dataframe in the app; a bare `df` only displays in notebooks
dataset = Dataset.from_pandas(df)
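# Encode proverbs as encoder inputs and their meanings as decoder labels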
def preprocess_function(examples):
    inputs = examples["Proverb"]
    targets = examples["Meaning"]
    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding="max_length")
    # text_target replaces the deprecated as_target_tokenizer() context manager
    labels = tokenizer(text_target=targets, max_length=128, truncation=True, padding="max_length")
    # Mask padding tokens with -100 so they are ignored by the cross-entropy loss
    model_inputs["labels"] = [
        [(tok if tok != tokenizer.pad_token_id else -100) for tok in seq]
        for seq in labels["input_ids"]
    ]
    return model_inputs
tokenized_dataset = dataset.map(preprocess_function, batched=True)
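# Hold out 20% of the examples for evaluation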
dataset_split = tokenized_dataset.train_test_split(test_size=0.2)
train_dataset = dataset_split["train"]
test_dataset = dataset_split["test"]
print(f"Training dataset size: {len(train_dataset)}")
print(f"Testing dataset size: {len(test_dataset)}")
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    save_steps=500,
)
# Initialize Trainer with the train/test split computed above
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)
# Fine-tune the model
trainer.train()
model.save_pretrained("./fine-tuned-mt5-marathi-proverbs")
tokenizer.save_pretrained("./fine-tuned-mt5-marathi-proverbs")
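# To reload the fine-tuned weights later (a minimal sketch; paths match the save calls above):
# model = MT5ForConditionalGeneration.from_pretrained("./fine-tuned-mt5-marathi-proverbs")
# tokenizer = T5Tokenizer.from_pretrained("./fine-tuned-mt5-marathi-proverbs")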
repo_id = "grpathak22/mt5-proverbs"
# # Log in and push to the Hugging Face Hub (Repository is deprecated; push_to_hub is the
# # supported path, and hf_token is assumed to hold a write-access token):
# from huggingface_hub import login, create_repo
# login(token=hf_token)
# create_repo(repo_id, exist_ok=True)
# model.push_to_hub(repo_id, commit_message="Add fine-tuned MT5 model for Marathi proverbs")
# tokenizer.push_to_hub(repo_id)
prompt = "अति शहाणा त्याचा बैल रिकामा"  # Marathi proverb, roughly: "the over-clever man's bullock stands idle"
# Tokenize the input prompt
input_ids = tokenizer.encode(prompt, return_tensors="pt")
# Generate the output
output_ids = model.generate(input_ids, max_length=256)
# Decode the output to text
output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
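# Show the predicted meaning in the app
st.write(output_text)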