Please remember: put one sentence per line, with no more than 256 characters per line (I have very limited hardware). DO NOT USE FP16: the original model was trained in BF16, and FP16 will give unexpected results!
Sample Code
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, BitsAndBytesConfig
from tqdm import tqdm
from peft import PeftModelForSeq2SeqLM
import os
import torch
from datasets import load_from_disk as load
# 4-bit NF4 quantization with double quantization; compute dtype is bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)
model_to_load = 'jetaudio/novel_zh2vi'
# Load the quantized base model first, then wrap it with the PEFT adapter.
base_model = AutoModelForSeq2SeqLM.from_pretrained(
    'google/madlad400-3b-mt',
    quantization_config=bnb_config,
    device_map='auto',
)
model = PeftModelForSeq2SeqLM.from_pretrained(base_model, model_to_load)
tokenizer = AutoTokenizer.from_pretrained(model_to_load)
# luanhoilacvien_cn.txt is the source file written in Chinese.
# Read it inside a context manager so the file handle is closed right away.
with open('luanhoilacvien_cn.txt', 'r', encoding='utf8') as fin:
    text = fin.read()
def trans(texts, temp=1, top_p=0.8):
    """Translate a batch of sentences and return the results joined by newlines.

    Args:
        texts: List of input strings to tokenize and pass to the model.
        temp: Sampling temperature forwarded to ``model.generate``.
        top_p: Nucleus-sampling threshold forwarded to ``model.generate``.

    Returns:
        A single string containing one decoded output per generated
        sequence, separated by ``'\n'``.
    """
    # Every input is padded to 256 tokens so the batch forms a single tensor.
    encodings = tokenizer(
        texts, return_tensors='pt', padding='max_length', max_length=256
    ).to('cuda')
    gens = model.generate(
        input_ids=encodings.input_ids,
        # Pass the padding mask explicitly instead of relying on generate()
        # to infer it from the pad token id.
        attention_mask=encodings.attention_mask,
        do_sample=True,
        max_length=256,
        temperature=temp,
        top_p=top_p,
    )
    return '\n'.join(
        tokenizer.decode(gen, skip_special_tokens=True) for gen in gens
    )
# Remove all spaces and split into lines. Empty lines are dropped outright
# (runs of blank lines or a trailing newline would otherwise produce empty
# entries that get sent to the model as a bare '<2vi>' prompt).
text = [line for line in text.replace(' ', '').split('\n') if line]
batch_size = 32
# Consecutive slices of at most batch_size sentences each.
texts = [text[start:start + batch_size] for start in range(0, len(text), batch_size)]
for sens in tqdm(texts):
    # '<2vi>' is prepended to every sentence (presumably the target-language
    # tag the model expects -- confirm against the base model's docs).
    t = trans(['<2vi>' + sen for sen in sens], temp=0.1, top_p=0.3)
    # luanhoilacvien_vi.txt is the result in Vietnamese. It is opened in
    # append mode per batch, so finished batches are on disk before the next
    # one starts.
    with open('./luanhoilacvien_vi.txt', 'a', encoding='utf8') as fout:
        fout.write(t + '\n')
Framework versions
- PEFT 0.7.1
- Transformers 4.36.1
- Pytorch 2.1.2+cu121
- Datasets 2.15.0
- Tokenizers 0.15.0
- Downloads last month
- 0
Inference API (serverless) does not yet support peft models for this pipeline type.
Model tree for jetaudio/novel_zh2vi_madlad400_3b_v1
Base model
google/madlad400-3b-mt