
anhdungitvn/vi-mistral-x

anhdungitvn/vi-mistral-x is a Vietnamese-focused language model obtained by continual pretraining of mistralai/Mistral-7B-v0.1 on Vietnamese text. The resulting model has general-domain Vietnamese capabilities and can be further fine-tuned for specialized domains, making it suitable for businesses, researchers, and developers working with the Vietnamese language across a range of sectors.

Model Details

Model Description

  • Developed by: James
  • Model type: Mistral
  • Model class: MistralForCausalLM
  • Language(s): Vietnamese
  • License: Not yet decided
  • Finetuned from model: mistralai/Mistral-7B-v0.1

Uses

Direct Use

from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM

model_name_or_path = "anhdungitvn/vi-mistral-x"
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
model = AutoModelForCausalLM.from_pretrained(model_name_or_path)

text = "Ngân hàng lớn nhất Việt Nam là"
outputs = model(**tokenizer(text, return_tensors="pt"))  # forward pass; returns logits
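
For open-ended completion, generate is usually more convenient than a raw forward pass. A minimal sketch; max_new_tokens and the decoding settings below are illustrative, not values recommended for this model:

inputs = tokenizer(text, return_tensors="pt")
generated = model.generate(**inputs, max_new_tokens=50, do_sample=False)  # greedy decoding
print(tokenizer.decode(generated[0], skip_special_tokens=True))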

Downstream Use

SFT
from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM

model_name_or_path = "anhdungitvn/vi-mistral-x"
tokenizer = AutoTokenizer.from_pretrained(
  model_name_or_path,
  model_max_length=4096  # customize it yourself
)
model = AutoModelForCausalLM.from_pretrained(
  model_name_or_path,
  attn_implementation='sdpa'  # customize it yourself
)


from peft import LoraConfig
from peft import PeftConfig
from peft import PeftModel
from peft import get_peft_model

config = LoraConfig(
  r=64,
  lora_alpha=128,
  lora_dropout=0.00,
  target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]  # customize it yourself
)
model = get_peft_model(model, config)
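
After wrapping, it can be worth confirming how many parameters the adapters actually make trainable (a standard peft helper):

model.print_trainable_parameters()  # prints trainable vs. total parameter counts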


from datasets import load_dataset
from transformers import DataCollatorForLanguageModeling

dataset = load_dataset("anhdungitvn/vi-bfsi-sft-dummy-1k")  # customize it yourself

def process_function(examples):
  # customize it yourself; with batched map() this must return a dict of columns
  return tokenizer(examples["text"])

dataset = dataset.map(
  process_function,
  batched=True,
  num_proc=64,
  remove_columns=dataset["train"].column_names,
  desc="Preprocessing"
)

data_collator = DataCollatorForLanguageModeling(
  tokenizer, mlm=False
) 
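
As a quick sanity check (hypothetical snippet), with mlm=False the collator clones input_ids into labels, so the model is trained on plain next-token prediction:

batch = data_collator([tokenizer("Ngân hàng lớn nhất Việt Nam là")])
print(batch["input_ids"].shape, batch["labels"].shape)  # labels mirror input_ids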


import evaluate

metric = evaluate.load("accuracy")

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    # shift so that the prediction at position i is scored against token i+1
    labels = labels[:, 1:].reshape(-1)
    preds = preds[:, :-1].reshape(-1)
    return metric.compute(
      predictions=preds,
      references=labels
    )

def preprocess_logits_for_metrics(logits, labels):
    if isinstance(logits, tuple):
        logits = logits[0]
    return logits.argmax(dim=-1)


from transformers import TrainingArguments
from transformers import Trainer

args = TrainingArguments(
    output_dir="output_dir",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=1,
    num_train_epochs=1,
    max_steps=10,  # for test
    warmup_ratio=0.01,  # or set warmup_steps instead
    lr_scheduler_type='cosine',  # cosine (fast), linear (stable), constant (test)
    learning_rate=1e-5,
    weight_decay=0.0001,
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": True},
    fp16=False,
    bf16=True,
    remove_unused_columns=False,
    dataloader_drop_last=True,
    logging_strategy="steps",
    logging_steps=10,  # customize it yourself
    save_strategy="steps",
    save_steps=500,  # customize it yourself
    save_total_limit=1,
    evaluation_strategy="steps",
    eval_steps=500,  # customize it yourself
    push_to_hub=False,
    # hub_private_repo=True,
    # hub_model_id="anhdungitvn/my-model",
    # hub_token=hf_token_write,
    run_name='vi-mistral-x-sft'
)

trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    data_collator=data_collator,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    compute_metrics=compute_metrics,
    preprocess_logits_for_metrics=preprocess_logits_for_metrics
)

model.config.use_cache = False  # the KV cache is incompatible with gradient checkpointing during training
trainer.train()
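
After training, the LoRA adapter can be saved on its own and, optionally, merged back into the base weights for standalone deployment. A minimal sketch, assuming model is still the PeftModel from above; the output paths are illustrative:

model.save_pretrained("output_dir/adapter")  # saves only the small adapter weights
merged = model.merge_and_unload()  # base model with the LoRA weights merged in
merged.save_pretrained("output_dir/merged")
tokenizer.save_pretrained("output_dir/merged")
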
DPO
from datasets import load_dataset

dataset = load_dataset("anhdungitvn/vi-bfsi-dpo-dummy-1k")  # customize it yourself

def process_function(examples):
  # customize it yourself; DPO expects "prompt", "chosen" and "rejected" columns
  return examples


from trl import DPOTrainer as Trainer


trainer = Trainer(
  model,
  model_ref,  # reference model; pass None to let the trainer copy `model`
  **kwargs  # args, train_dataset, tokenizer, ... as in the SFT example
)
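
The args entry above can be a DPOConfig. A minimal sketch, assuming a recent trl release where DPOConfig exists (older versions pass beta directly to DPOTrainer); the values are illustrative:

from trl import DPOConfig

args = DPOConfig(
    output_dir="output_dir_dpo",
    beta=0.1,  # strength of the KL penalty toward the reference model
    per_device_train_batch_size=4,
)
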
KTO
from datasets import load_dataset

dataset = load_dataset("anhdungitvn/vi-bfsi-kto-dummy-1k")  # customize it yourself

def process_function(examples):
  # customize it yourself; KTO expects "prompt", "completion" and a boolean "label"
  return examples


from trl import KTOTrainer as Trainer
from trl import KTOConfig


args = KTOConfig(
    beta=0.1,
    desirable_weight=1.0,
    undesirable_weight=1.0,
)

trainer = Trainer(
  model,
  model_ref,  # reference model; pass None to let the trainer copy `model`
  args=args,
  **kwargs  # train_dataset, tokenizer, ... as in the SFT example
)
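
For reference, a hypothetical KTO record looks like this; unlike DPO, KTO learns from per-example desirability labels rather than chosen/rejected pairs:

example = {
    "prompt": "Ngân hàng lớn nhất Việt Nam là",
    "completion": " ...",  # the model completion being judged
    "label": True,  # True = desirable, False = undesirable
}
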
QLoRA
import torch
from transformers import BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    llm_int8_threshold=6.0,
    llm_int8_has_fp16_weight=False,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
)

model = AutoModelForCausalLM.from_pretrained(
    "anhdungitvn/vi-mistral-x",
    quantization_config=bnb_config,  # 4-bit GPU quantization (load_in_4bit is already set in bnb_config)
    torch_dtype=torch.bfloat16,
)
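
To train LoRA adapters on top of this quantized model (the usual QLoRA recipe), peft ships a preparation helper. A minimal sketch; the LoraConfig values mirror the SFT example above:

from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

model = prepare_model_for_kbit_training(model)  # enables input grads, casts norms for k-bit training
config = LoraConfig(
    r=64,
    lora_alpha=128,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],  # subset of the SFT example
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, config)
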
Evaluation on VMLU

VMLU: https://vmlu.ai

import torch
from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM

model_name_or_path = "anhdungitvn/vi-mistral-x"

tokenizer = AutoTokenizer.from_pretrained(
    model_name_or_path,
    padding_side = "left"
) 
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    model_name_or_path,
    torch_dtype=torch.float16
)

class InferModel:
    def __init__(self, model, tokenizer, **kwargs):
        self.model = model
        self.tokenizer = tokenizer

    def __call__(self, prompt, **kwargs):
        outputs = self.model.generate(**self.tokenizer(prompt, return_tensors="pt"))
        text = self.tokenizer.batch_decode(outputs)[0]
        text = text.split("Đáp án: ")[-1].lstrip()
        return text

infer = InferModel(model=model, tokenizer=tokenizer)


from datasets import load_dataset

dataset_name_or_path = "anhdungitvn/vmlu_v1.5"

dataset = load_dataset(dataset_name_or_path)
test_dataset = dataset["test"]
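
A quick smoke test (hypothetical; it just runs the wrapper on the first VMLU prompt):

print(infer(test_dataset[0]["prompt"]))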


import re
import os
import pandas as pd
from tqdm.auto import tqdm

all_res = []

with tqdm(total=len(test_dataset)) as pbar:
    for example in test_dataset:
        answer = infer(example["prompt"])
        all_res.append({
            "id": example['id'],
            "prompt": example["prompt"],
            "question": example["question"],
            "answer": answer
        })
        pbar.update(1)

df = pd.DataFrame(all_res)
df['answer'] = df.answer.map(lambda x: x[0].lower())
df['answer'] = df['answer'].map(lambda x: re.sub(r'[^abcde]', '', x))
df[['id', 'answer']].to_csv('submission.csv', index=None)  # to_csv writes the file and returns None

Training

Hyperparams:

  • Learning rate: 2e-5, 5e-6 (PT); 5e-5, 2e-5 (SFT)
  • per_device_train_batch_size: 4

Progress: ⬛⬛⬜⬜⬜⬜⬜⬜⬜⬜ 16%

  • Epoch: 0/1
  • Steps: 10000/63500
  • Tokens: 1310720000/8323137536
  • Data: ~3GB/20GB

This section is being updated...

Alignment

SFT_1

SFT_2

This section is being updated...

Evaluation

VMLU

Pretrained Model

| #  | MODEL          | CREATOR       | ACCESS | EVALUATION DATE | STEM  | SOCIAL SCIENCE | HUMANITIES | OTHERS | AVG   |
|----|----------------|---------------|--------|-----------------|-------|----------------|------------|--------|-------|
| 1  | GPT-4          | OpenAI        | API    | 08/01/2024      | 63.84 | 71.78          | 66.14      | 60.37  | 65.53 |
| 2  | gemini         | Google        | API    | 30/01/2024      | 42.8  | 60.31          | 55.35      | 51.30  | 51.03 |
| 3  | ChatGPT        | OpenAI        | API    | 08/01/2024      | 43.24 | 51.67          | 46.96      | 46.32  | 46.33 |
| 4  | ViGPT-1.6B-v1  | Vin BigData   | Private| 08/01/2024      | 35.06 | 48.72          | 47.20      | 42.54  | 42.34 |
| 5  | gemma-7b-it    | Google        | Weight | 22/02/2024      | 39.95 | 44.93          | 43.39      | 40.11  | 41.9  |
| 6  | Qwen-7B        | Alibaba Cloud | Weight | 08/01/2024      | 30.64 | 35.07          | 34.15      | 32.68  | 32.81 |
| 7  | vi-mistral-x   | James         | Private| 10/04/2024      | 24.88 | 34.08          | 35.11      | 29.26  | 30.32 |
| 8  | gemma-2b-it    | Google        | Weight | 22/02/2024      | 24.39 | 29.59          | 31.01      | 26.81  | 27.72 |
| 9  | sealion7b      | AI Singapore  | Weight | 08/01/2024      | 26.28 | 28.57          | 27.66      | 27.34  | 26.73 |
| 10 | bloom-1b7      | BigScience    | Weight | 08/01/2024      | 25.13 | 25.09          | 26.34      | 25.19  | 25.51 |

Aligned Model

| #  | MODEL                  | CREATOR               | ACCESS | BASE MODEL          | EVALUATION DATE | STEM  | SOCIAL SCIENCE | HUMANITIES | OTHERS | AVG   |
|----|------------------------|-----------------------|--------|---------------------|-----------------|-------|----------------|------------|--------|-------|
| 1  | VNPTAI.IO-14B          | VNPT AI               | Private| Qwen1.5-14B-Chat    | 11/03/2024      | 51.64 | 61.75          | 58.09      | 54.51  | 55.83 |
| 2  | Vistral-7B-Chat        | UONLP x Ontocord      | Weight | Mistral-7B-v0.1     | 16/01/2024      | 43.32 | 57.02          | 55.12      | 48.01  | 50.07 |
| 3  | SeaLLM-7b-v2           | DAMO Academy          | Weight | llama-2-7b          | 15/02/2024      | 39.95 | 52.02          | 49.38      | 45.27  | 45.79 |
| 4  | vi-mistral-x           | James                 | Private| vi-mistral-x        | 10/04/2024      | 31.13 | 48.81          | 48.36      | 40.44  | 40.97 |
| 5  | bloomz-7b1             | BigScience            | Weight | Bloom-7b1           | 08/01/2024      | 32.63 | 45.73          | 41.85      | 39.89  | 38.87 |
| 6  | T-Llama                | FPTU HCM              | Weight | llama-2-7b          | 18/03/2024      | 32.2  | 43.15          | 40.31      | 36.57  | 37.28 |
| 7  | vbd-llama2-7b-50b-chat | Vin BigData           | Weight | llama-2-7b          | 08/01/2024      | 31.45 | 40.34          | 40.24      | 39.62  | 36.98 |
| 8  | vietcuna-3b            | Virtual Interactive   | Weight | bloomz-3b           | 08/01/2024      | 30.12 | 39.92          | 37.86      | 33.83  | 34.79 |
| 9  | bloomz-1b7             | BigScience            | Weight | Bloom-1b7           | 08/01/2024      | 29.72 | 40.17          | 34.73      | 33.41  | 33.65 |
| 10 | SeaLLM-7B-Hybrid       | DAMO Academy          | Weight | llama-2-7b          | 08/01/2024      | 29.49 | 34.61          | 36.68      | 34.52  | 33.39 |

Per-category scores, pretrained (PT) vs. SFT-aligned model:

| Category / Subcategory | Score (PT) | Score (SFT) | Diff |
|------------------------|------------|-------------|------|
|total                                                           |30.32   |40.97    |10.65 |
|stem_applied_informatics                                        |39.44   |30.36    |-9.08 |
|stem_computer_architecture                                      |31.11   |39.77    |8.66  |
|stem_computer_network                                           |34.64   |36.67    |2.03  |
|stem_discrete_mathematics                                       |23.64   |61.99    |38.35 |
|stem_electrical_engineering                                     |22.73   |33.33    |10.6  |
|stem_elementary_mathematics                                     |19.44   |39.18    |19.74 |
|stem_elementary_science                                         |55.0    |52.94    |-2.06 |
|stem_high_school_biology                                        |15.0    |24.14    |9.14  |
|stem_high_school_chemistry                                      |22.78   |45.61    |22.83 |
|stem_high_school_mathematics                                    |16.22   |40.44    |24.22 |
|stem_high_school_physics                                        |23.33   |37.78    |14.45 |
|stem_introduction_to_chemistry                                  |14.53   |71.91    |57.38 |
|stem_introduction_to_physics                                    |23.12   |61.22    |38.1  |
|stem_introduction_to_programming                                |29.05   |40.56    |11.51 |
|stem_metrology_engineer                                         |22.7    |51.69    |28.99 |
|stem_middle_school_biology                                      |31.18   |48.81    |17.63 |
|stem_middle_school_chemistry                                    |18.33   |33.91    |15.58 |
|stem_middle_school_mathematics                                  |17.59   |60.56    |42.97 |
|stem_middle_school_physics                                      |21.67   |46.3     |24.63 |
|stem_operating_system                                           |30.56   |45.25    |14.69 |
|stem_statistics_and_probability                                 |10.34   |38.89    |28.55 |
|stem_total                                                      |24.88   |41.67    |16.79 |
|other_clinical_pharmacology                                     |26.11   |30.56    |4.45  |
|other_driving_license_certificate                               |45.61   |34.64    |-10.97|
|other_environmental_engineering                                 |11.7    |20.61    |8.91  |
|other_internal_basic_medicine                                   |34.5    |30.11    |-4.39 |
|other_preschool_pedagogy                                        |34.31   |10.56    |-23.75|
|other_tax_accountant                                            |20.69   |68.33    |47.64 |
|other_tax_civil_servant                                         |41.52   |26.11    |-15.41|
|other_total                                                     |29.26   |32.22    |2.96  |
|other_accountant                                                |21.43   |22.97    |1.54  |
|other_civil_servant                                             |27.49   |24.44    |-3.05 |
|humanity_economic_law                                           |29.81   |26.82    |-2.99 |
|humanity_education_law                                          |33.13   |26.01    |-7.12 |
|humanity_elementary_history                                     |49.72   |30.17    |-19.55|
|humanity_high_school_history                                    |31.11   |37.59    |6.48  |
|humanity_high_school_literature                                 |25.56   |46.47    |20.91 |
|humanity_history_of_world_civilization                          |41.11   |27.78    |-13.33|
|humanity_idealogical_and_moral_cultivation                      |49.44   |16.67    |-32.77|
|humanity_introduction_to_laws                                   |39.68   |24.44    |-15.24|
|humanity_introduction_to_vietnam_culture                        |28.33   |41.11    |12.78 |
|humanity_logic                                                  |18.97   |34.48    |15.51 |
|humanity_middle_school_history                                  |37.78   |31.13    |-6.65 |
|humanity_middle_school_literature                               |37.36   |45.81    |8.45  |
|humanity_revolutionary_policy_of_the_vietnamese_commununist_part|36.67   |53.33    |16.66 |
|humanity_vietnamese_language_and_literature                     |17.24   |57.06    |39.82 |
|humanity_total                                                  |35.11   |37.27    |2.16  |
|humanity_administrative_law                                     |37.78   |43.37    |5.59  |
|humanity_business_law                                           |39.11   |53.11    |14.0  |
|humanity_civil_law                                              |41.11   |44.44    |3.33  |
|humanity_criminal_law                                           |38.04   |43.89    |5.85  |
|social_science_middle_school_geography                          |27.21   |58.33    |31.12 |
|social_science_principles_of_marxism_and_leninism               |36.67   |61.67    |25.0  |
|social_science_sociology                                        |39.89   |47.62    |7.73  |
|social_science_business_administration                          |20.69   |47.78    |27.09 |
|social_science_high_school_civil_education                      |43.89   |27.59    |-16.3 |
|social_science_high_school_geography                            |33.33   |56.11    |22.78 |
|social_science_ho_chi_minh_ideology                             |41.34   |55.17    |13.83 |
|social_science_macroeconomics                                   |21.67   |50.0     |28.33 |
|social_science_microeconomics                                   |23.89   |39.08    |15.19 |
|social_science_middle_school_civil_education                    |52.25   |48.36    |-3.89 |
|social_science_total                                            |34.08   |48.89    |14.81 |

This section is being updated...
