anhdungitvn/vi-mistral-x
The model, referred to as anhdungitvn/vi-mistral-x, has been meticulously developed with an emphasis on processing the Vietnamese language.
This was achieved through continual pretraining of the mistralai/Mistral-7B-v0.1 model, which specialized and adapted it to the Vietnamese language with general-domain capabilities. Furthermore, the model can be fine-tuned for a wide range of specialized domains. This adaptability makes it particularly well suited to the specific requirements of businesses, researchers, and developers working with the Vietnamese language across various sectors.
Model Details
Model Description
- Developed by: James
- Model type: Mistral
- Model class: MistralForCausalLM
- Language(s): Vietnamese
- License: Not yet decided
- Finetuned from model: mistralai/Mistral-7B-v0.1
Model Sources
- Repository: anhdungitvn/vi-mistral-x
- Paper: Enhancing Memory and Computational Efficiency in Training Transformer-Based Models (unpublished)
- Technical Report: Vi-Mistral-X: Building a Vietnamese Language Model with Advanced Continual Pre-training
Uses
Direct Use
# Minimal example: load the pretrained checkpoint and run one forward pass.
from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM

model_name_or_path = "anhdungitvn/vi-mistral-x"
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
model = AutoModelForCausalLM.from_pretrained(model_name_or_path)

text = "Ngân hàng lớn nhất Việt Nam là"
# The tokenizer keyword is `return_tensors` (plural); 'pt' asks for PyTorch
# tensors, which is what the model's forward pass expects.
outputs = model(**tokenizer(text, return_tensors='pt'))
Downstream Use
SFT
# Load the tokenizer and the base model that will be fine-tuned.
from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM

model_name_or_path = "anhdungitvn/vi-mistral-x"

tokenizer = AutoTokenizer.from_pretrained(
    model_name_or_path,
    model_max_length=4096  # customize it yourself
)
# The batch collator used later in this walkthrough pads sequences, which
# needs a pad token. Fall back to EOS (as the evaluation snippet does) when
# the checkpoint does not define one; this is a no-op otherwise.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    model_name_or_path,
    attn_implementation='sdpa'  # customize it yourself
)
# Wrap the base model with LoRA adapters so only the adapter weights train.
from peft import LoraConfig
from peft import PeftConfig
from peft import PeftModel
from peft import get_peft_model

config = LoraConfig(
    r=64,
    lora_alpha=128,
    lora_dropout=0.00,
    # The LoraConfig field is `target_modules` (plural): the names of the
    # sub-modules that receive an adapter.
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]  # customize it yourself
)
model = get_peft_model(model, config)
# Load the SFT corpus, tokenize it, and build the causal-LM batch collator.
from datasets import load_dataset
from transformers import DataCollatorForLanguageModeling

dataset = load_dataset("anhdungitvn/vi-bfsi-sft-dummy-1k")  # customize it yourself


def process_function(examples):
    """Tokenize one batch of raw examples.

    Args:
        examples: a batch (dict of column name -> list of values) as passed
            by ``Dataset.map(..., batched=True)``; its "text" column is read.

    Returns:
        The tokenizer output for the batch. A batched map function has to
        return a dict-like of columns, not a bare list, and the collator
        below needs token ids rather than raw strings.
    """
    # customize it yourself
    # truncation=True caps each sequence at the tokenizer's model_max_length.
    return tokenizer(examples["text"], truncation=True)


dataset = dataset.map(
    process_function,
    batched=True,
    num_proc=64,
    # Drop the raw columns so only the tokenizer outputs reach the collator.
    remove_columns=dataset["train"].column_names,
    desc="Preprocessing"
)

# mlm=False selects causal-LM batches rather than masked-LM ones.
data_collator = DataCollatorForLanguageModeling(
    tokenizer, mlm=False
)
import evaluate

metric = evaluate.load("accuracy")


def compute_metrics(eval_preds):
    """Compute next-token accuracy for an evaluation pass.

    Args:
        eval_preds: a ``(preds, labels)`` pair of 2-D arrays indexed as
            [example, position]; ``preds`` holds predicted token ids.

    Returns:
        Whatever ``metric.compute`` returns for the aligned, flattened pairs.
    """
    preds, labels = eval_preds
    # The prediction at position t is for the token at t + 1, so drop the
    # first label and the last prediction to line the two up.
    labels = labels[:, 1:].reshape(-1)
    preds = preds[:, :-1].reshape(-1)
    # -100 marks label positions that must be ignored (e.g. padding); scoring
    # them would count every such position as a wrong prediction.
    keep = labels != -100
    return metric.compute(
        predictions=preds[keep],
        references=labels[keep]
    )
def preprocess_logits_for_metrics(logits, labels):
    """Reduce raw logits to predicted token ids before metrics are computed.

    Args:
        logits: the logits tensor, or a tuple whose first element is the
            logits tensor.
        labels: unused; kept because the hook is called with both arguments.

    Returns:
        The argmax over the last dimension of the logits.
    """
    if isinstance(logits, tuple):
        # Only the first element is the logits; ignore any extras.
        logits = logits[0]
    return logits.argmax(dim=-1)
from transformers import TrainingArguments
from transformers import Trainer

# Step intervals for logging, checkpointing and evaluation. They were
# referenced below without being defined; the values are chosen so that each
# event fires within the 10-step smoke test. Customize them yourself.
logging_steps = 1
save_steps = 5
eval_steps = 5

args = TrainingArguments(
    output_dir="output_dir",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=1,
    num_train_epochs=1,
    max_steps=10,  # for test; a positive max_steps overrides num_train_epochs
    warmup_ratio=0.01,  # warmup_steps
    lr_scheduler_type='cosine',  # cosine (fast), linear (stable), constant (test)
    learning_rate=0.000010,  # 10µ
    weight_decay=0.0001,
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": True},
    fp16=False,
    bf16=True,
    remove_unused_columns=False,
    dataloader_drop_last=True,
    logging_strategy="steps",
    logging_steps=logging_steps,
    save_strategy="steps",
    save_steps=save_steps,
    save_total_limit=1,
    # NOTE: recent transformers releases rename this keyword to `eval_strategy`.
    evaluation_strategy="steps",
    eval_steps=eval_steps,
    push_to_hub=False,
    # hub_private_repo=True,
    # hub_model_id="anhdungitvn/my-model",
    # hub_token=hf_token_write,
    run_name='vi-mistral-x-sft'
)
# Gather every Trainer input in one mapping so the wiring is easy to scan.
trainer_kwargs = {
    "model": model,
    "tokenizer": tokenizer,
    "args": args,
    "data_collator": data_collator,
    "train_dataset": dataset["train"],
    "eval_dataset": dataset["test"],
    "compute_metrics": compute_metrics,
    "preprocess_logits_for_metrics": preprocess_logits_for_metrics,
}
trainer = Trainer(**trainer_kwargs)

# Turn the generation cache off before training starts (presumably because
# the training arguments enable gradient checkpointing).
model.config.use_cache = False

trainer.train()
DPO
from datasets import load_dataset

dataset = load_dataset("anhdungitvn/vi-bfsi-dpo-dummy-1k")  # customize it yourself


def process_function(examples):
    """Placeholder preprocessing hook: return the batch's "text" column.

    It is not applied anywhere in this snippet. Adapt it to the format your
    trainer expects before passing it to ``dataset.map``.
    """
    # customize it yourself
    return examples["text"]


# The import was missing its alias; `Trainer` is the name used below and the
# same alias the KTO snippet uses.
from trl import DPOTrainer as Trainer

# `model_ref` (the reference model) and `kwargs` (remaining trainer
# arguments) are placeholders you must define yourself.
trainer = Trainer(
    model,
    model_ref,
    **kwargs
)
KTO
from datasets import load_dataset

dataset = load_dataset("anhdungitvn/vi-bfsi-kto-dummy-1k")  # customize it yourself


def process_function(examples):
    """Placeholder preprocessing hook: return the batch's "text" column.

    It is not applied anywhere in this snippet. Adapt it to the format your
    trainer expects before passing it to ``dataset.map``.
    """
    # customize it yourself
    return examples["text"]


from trl import KTOTrainer as Trainer
from trl import KTOConfig

args = KTOConfig(
    # Same output directory as the SFT example. Passing it explicitly keeps
    # the snippet working on library versions where it is a required argument.
    output_dir="output_dir",
    beta=0.1,
    desirable_weight=1.0,
    undesirable_weight=1.0,
)

# `model_ref` (the reference model) and `kwargs` (remaining trainer
# arguments) are placeholders you must define yourself.
trainer = Trainer(
    model,
    model_ref,
    args=args,
    **kwargs
)
QLoRA
# Load the model with 4-bit (NF4) quantization for QLoRA fine-tuning.
import torch  # needed for the dtype constants below
from transformers import AutoModelForCausalLM
from transformers import BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    llm_int8_threshold=6.0,
    llm_int8_has_fp16_weight=False,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
)

# 4-bit loading is requested through `quantization_config` alone; also
# passing `load_in_4bit=True` as a keyword is rejected by from_pretrained.
model = AutoModelForCausalLM.from_pretrained(
    "anhdungitvn/vi-mistral-x",
    quantization_config=bnb_config,  # GPU Quantization
    torch_dtype=torch.bfloat16,  # GPU
)
Evaluation on VMLU
VMLU: https://vmlu.ai
# Load the tokenizer and model used for the VMLU benchmark run.
import torch  # needed for torch.float16 below
from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM

model_name_or_path = "anhdungitvn/vi-mistral-x"

tokenizer = AutoTokenizer.from_pretrained(
    model_name_or_path,
    padding_side="left"
)
# Use the EOS token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    model_name_or_path,
    torch_dtype=torch.float16
)
class InferModel:
    """Callable wrapper that turns a prompt into the model's answer text."""

    def __init__(self, model, tokenizer, **kwargs):
        """Store the model and tokenizer; extra keyword arguments are ignored."""
        self.model = model
        self.tokenizer = tokenizer

    def __call__(self, prompt, **kwargs):
        """Generate a continuation for ``prompt`` and return the answer part.

        Args:
            prompt: the full prompt text.
            **kwargs: forwarded to ``model.generate``. ``max_new_tokens``
                defaults to 8 when not supplied.

        Returns:
            The decoded text after the last "Đáp án: " marker, with leading
            whitespace stripped. If the marker is absent, the whole decoded
            text is returned.
        """
        # Without an explicit budget, generation falls back to the library
        # default length limit, which (unless the checkpoint's generation
        # config says otherwise) is far shorter than a benchmark prompt.
        kwargs.setdefault("max_new_tokens", 8)
        # Put the encoded prompt on the same device as the model.
        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
        outputs = self.model.generate(**inputs, **kwargs)
        text = self.tokenizer.batch_decode(outputs)[0]
        text = text.split("Đáp án: ")[-1].lstrip()
        return text
# Run the model over the VMLU test split and write a submission file.
infer = InferModel(model=model, tokenizer=tokenizer)

from datasets import load_dataset

dataset_name_or_path = "anhdungitvn/vmlu_v1.5"
dataset = load_dataset(dataset_name_or_path)
test_dataset = dataset["test"]

import re
import os
import pandas as pd
from tqdm.auto import tqdm

all_res = []
# tqdm wraps the dataset directly, so the progress bar advances per example.
for example in tqdm(test_dataset):
    answer = infer(example["prompt"])
    all_res.append({
        "id": example['id'],
        "prompt": example["prompt"],
        "question": example["question"],
        "answer": answer
    })

df = pd.DataFrame(all_res)
# Keep only the first character of each answer, lower-cased. Slicing with
# [:1] yields "" for an empty answer where indexing with [0] would raise.
df['answer'] = df.answer.map(lambda x: x[:1].lower())
# Anything that is not one of the option letters a-e becomes an empty answer.
df['answer'] = df['answer'].map(lambda x: re.sub(r'[^abcde]', '', x))
# to_csv returns None when given a path, so its result is not kept.
df[['id', 'answer']].to_csv('submission.csv', index=False)
Training
Hyperparams:
- Learning rate: 20μ, 5μ (PT); 50μ, 20μ (SFT), where μ denotes 1e-6 (e.g. 20μ = 2e-5)
- per_device_train_batch_size: 4
Progress: ⬛⬛⬜⬜⬜⬜⬜⬜⬜⬜ 16%
- Epoch: 0/1
- Steps: 10000/63500
- Tokens: 1310720000/8323137536
- Data: ~3GB/20GB
This section is being updated...
Alignment
SFT_1
SFT_2
This section is being updated...
Evaluation
VMLU
Pretrained Model
| # | MODEL | CREATOR | ACCESS | EVALUATION DATE | STEM | SOCIAL SCIENCE | HUMANITIES | OTHERS | AVG |
|----|----------------|---------------|--------|-----------------|-------|----------------|------------|--------|-------|
| 1 | GPT-4 | OpenAI | API | 08/01/2024 | 63.84 | 71.78 | 66.14 | 60.37 | 65.53 |
| 2 | gemini | Google | API | 30/01/2024 | 42.8 | 60.31 | 55.35 | 51.30 | 51.03 |
| 3 | ChatGPT | OpenAI | API | 08/01/2024 | 43.24 | 51.67 | 46.96 | 46.32 | 46.33 |
| 4 | ViGPT-1.6B-v1 | Vin BigData | Private| 08/01/2024 | 35.06 | 48.72 | 47.20 | 42.54 | 42.34 |
| 5 | gemma-7b-it | Google | Weight | 22/02/2024 | 39.95 | 44.93 | 43.39 | 40.11 | 41.9 |
| 6 | Qwen-7B | Alibaba Cloud | Weight | 08/01/2024 | 30.64 | 35.07 | 34.15 | 32.68 | 32.81 |
| 7 | vi-mistral-x | James | Private| 10/04/2024 | 24.88 | 34.08 | 35.11 | 29.26 | 30.32 |
| 8 | gemma-2b-it | Google | Weight | 22/02/2024 | 24.39 | 29.59 | 31.01 | 26.81 | 27.72 |
| 9 | sealion7b | AI Singapore | Weight | 08/01/2024 | 26.28 | 28.57 | 27.66 | 27.34 | 26.73 |
| 10 | bloom-1b7 | BigScience | Weight | 08/01/2024 | 25.13 | 25.09 | 26.34 | 25.19 | 25.51 |
Aligned Model
| # | MODEL | CREATOR | ACCESS | BASE MODEL | EVALUATION DATE | STEM | SOCIAL SCIENCE | HUMANITIES | OTHERS | AVG |
|----|------------------------|-----------------------|--------|---------------------|-----------------|-------|----------------|------------|--------|-------|
| 1 | VNPTAI.IO-14B | VNPT AI | Private| Qwen1.5-14B-Chat | 11/03/2024 | 51.64 | 61.75 | 58.09 | 54.51 | 55.83 |
| 2 | Vistral-7B-Chat | UONLP x Ontocord | Weight | Mistral-7B-v0.1 | 16/01/2024 | 43.32 | 57.02 | 55.12 | 48.01 | 50.07 |
| 3 | SeaLLM-7b-v2 | DAMO Academy | Weight | llama-2-7b | 15/02/2024 | 39.95 | 52.02 | 49.38 | 45.27 | 45.79 |
| 4 | vi-mistral-x | James | Private| vi-mistral-x | 10/04/2024 | 31.13 | 48.81 | 48.36 | 40.44 | 40.97 |
| 5 | bloomz-7b1 | BigScience | Weight | Bloom-7b1 | 08/01/2024 | 32.63 | 45.73 | 41.85 | 39.89 | 38.87 |
| 6 | T-Llama | FPTU HCM | Weight | llama-2-7b | 18/03/2024 | 32.2 | 43.15 | 40.31 | 36.57 | 37.28 |
| 7 | vbd-llama2-7b-50b-chat | Vin BigData | Weight | llama-2-7b | 08/01/2024 | 31.45 | 40.34 | 40.24 | 39.62 | 36.98 |
| 8 | vietcuna-3b | Virtual Interactive | Weight | bloomz-3b | 08/01/2024 | 30.12 | 39.92 | 37.86 | 33.83 | 34.79 |
| 9 | bloomz-1b7 | BigScience | Weight | Bloom-1b7 | 08/01/2024 | 29.72 | 40.17 | 34.73 | 33.41 | 33.65 |
| 10 | SeaLLM-7B-Hybrid | DAMO Academy | Weight | llama-2-7b | 08/01/2024 | 29.49 | 34.61 | 36.68 | 34.52 | 33.39 |
|category_subcategory |score_PT|score_SFT|diff |
|----------------------------------------------------------------|--------|---------|------|
|total |30.32 |40.97 |10.65 |
|stem_applied_informatics |39.44 |30.36 |-9.08 |
|stem_computer_architecture |31.11 |39.77 |8.66 |
|stem_computer_network |34.64 |36.67 |2.03 |
|stem_discrete_mathematics |23.64 |61.99 |38.35 |
|stem_electrical_engineering |22.73 |33.33 |10.6 |
|stem_elementary_mathematics |19.44 |39.18 |19.74 |
|stem_elementary_science |55.0 |52.94 |-2.06 |
|stem_high_school_biology |15.0 |24.14 |9.14 |
|stem_high_school_chemistry |22.78 |45.61 |22.83 |
|stem_high_school_mathematics |16.22 |40.44 |24.22 |
|stem_high_school_physics |23.33 |37.78 |14.45 |
|stem_introduction_to_chemistry |14.53 |71.91 |57.38 |
|stem_introduction_to_physics |23.12 |61.22 |38.1 |
|stem_introduction_to_programming |29.05 |40.56 |11.51 |
|stem_metrology_engineer |22.7 |51.69 |28.99 |
|stem_middle_school_biology |31.18 |48.81 |17.63 |
|stem_middle_school_chemistry |18.33 |33.91 |15.58 |
|stem_middle_school_mathematics |17.59 |60.56 |42.97 |
|stem_middle_school_physics |21.67 |46.3 |24.63 |
|stem_operating_system |30.56 |45.25 |14.69 |
|stem_statistics_and_probability |10.34 |38.89 |28.55 |
|stem_total |24.88 |41.67 |16.79 |
|other_clinical_pharmacology |26.11 |30.56 |4.45 |
|other_driving_license_certificate |45.61 |34.64 |-10.97|
|other_environmental_engineering |11.7 |20.61 |8.91 |
|other_internal_basic_medicine |34.5 |30.11 |-4.39 |
|other_preschool_pedagogy |34.31 |10.56 |-23.75|
|other_tax_accountant |20.69 |68.33 |47.64 |
|other_tax_civil_servant |41.52 |26.11 |-15.41|
|other_total |29.26 |32.22 |2.96 |
|other_accountant |21.43 |22.97 |1.54 |
|other_civil_servant |27.49 |24.44 |-3.05 |
|humanity_economic_law |29.81 |26.82 |-2.99 |
|humanity_education_law |33.13 |26.01 |-7.12 |
|humanity_elementary_history |49.72 |30.17 |-19.55|
|humanity_high_school_history |31.11 |37.59 |6.48 |
|humanity_high_school_literature |25.56 |46.47 |20.91 |
|humanity_history_of_world_civilization |41.11 |27.78 |-13.33|
|humanity_idealogical_and_moral_cultivation |49.44 |16.67 |-32.77|
|humanity_introduction_to_laws |39.68 |24.44 |-15.24|
|humanity_introduction_to_vietnam_culture |28.33 |41.11 |12.78 |
|humanity_logic |18.97 |34.48 |15.51 |
|humanity_middle_school_history |37.78 |31.13 |-6.65 |
|humanity_middle_school_literature |37.36 |45.81 |8.45 |
|humanity_revolutionary_policy_of_the_vietnamese_commununist_part|36.67 |53.33 |16.66 |
|humanity_vietnamese_language_and_literature |17.24 |57.06 |39.82 |
|humanity_total |35.11 |37.27 |2.16 |
|humanity_administrative_law |37.78 |43.37 |5.59 |
|humanity_business_law |39.11 |53.11 |14.0 |
|humanity_civil_law |41.11 |44.44 |3.33 |
|humanity_criminal_law |38.04 |43.89 |5.85 |
|social_science_middle_school_geography |27.21 |58.33 |31.12 |
|social_science_principles_of_marxism_and_leninism |36.67 |61.67 |25.0 |
|social_science_sociology |39.89 |47.62 |7.73 |
|social_science_business_administration |20.69 |47.78 |27.09 |
|social_science_high_school_civil_education |43.89 |27.59 |-16.3 |
|social_science_high_school_geography |33.33 |56.11 |22.78 |
|social_science_ho_chi_minh_ideology |41.34 |55.17 |13.83 |
|social_science_macroeconomics |21.67 |50.0 |28.33 |
|social_science_microeconomics |23.89 |39.08 |15.19 |
|social_science_middle_school_civil_education |52.25 |48.36 |-3.89 |
|social_science_total |34.08 |48.89 |14.81 |
This section is being updated...
```
- Downloads last month
- 0