| | from datasets import Dataset
|
| | from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
|
| | from peft import LoraConfig, get_peft_model, PeftModel
|
| |
|
# Location of the raw training corpus (one sample per line).
# Left empty here; it must be set before the script can run.
raw_data_path = ""

# Load every line, line terminators included, into memory.
with open(raw_data_path, mode="r", encoding="utf-8") as f:
    raw_lines = list(f)
|
| |
|
def process_line(line):
    """Return *line* stripped, with its final "/"-delimited segment removed.

    Surrounding whitespace is stripped first. If the stripped text contains
    at least one "/", everything from the last "/" onwards is dropped;
    otherwise the stripped text is returned unchanged.
    """
    cleaned = line.strip()
    head, slash, _tail = cleaned.rpartition("/")
    if not slash:
        # No separator present: there is only one segment, keep it whole.
        return cleaned
    return head
|
| |
|
# Skip whitespace-only lines, then trim the last "/" segment from each kept line.
processed_samples = list(map(process_line, filter(str.strip, raw_lines)))

# Single-column dataset; the column name "text" is what tokenize_function reads.
dataset = Dataset.from_dict({"text": processed_samples})
|
| |
|
# Identifier or local path of the base checkpoint.
# Left empty here; it must be set before the script can run.
model_name = ""

tokenizer = AutoTokenizer.from_pretrained(model_name)

# The tokenization step pads every sample to a fixed length, which raises
# if the tokenizer has no pad token. Some causal-LM checkpoints ship without
# one, so fall back to the EOS token. This is a no-op when a pad token is
# already defined.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(model_name)
|
| |
|
# LoRA adapter settings: adapters are attached to the modules named
# "q_proj" and "v_proj"; no bias parameters are trained.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj"],
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
)

# Wrap the base model; from here on `model` refers to the PEFT-wrapped model.
model = get_peft_model(model, lora_config)
|
| |
|
def tokenize_function(examples):
    """Tokenize a batch of samples and build causal-LM labels.

    Each entry of ``examples["text"]`` is prefixed with a fixed instruction
    prompt, then tokenized, truncated and padded to exactly 256 tokens.

    Labels mirror ``input_ids`` except at padding positions (attention mask
    equal to 0), which are set to -100 so the loss ignores them. Copying
    ``input_ids`` verbatim would make the model learn to predict pad tokens.

    Args:
        examples: A batch mapping with a ``"text"`` key holding a list of
            strings (the shape produced by ``Dataset.map(..., batched=True)``).

    Returns:
        The tokenizer output with an added ``"labels"`` entry.
    """
    prompt = "根据以下关键词生成一首歌词,歌词中包含多个句子,句子与句子之间使用/隔开,让我们一步一步的思考(思考过程包含在<think>和</think>之间):"

    modified_texts = [prompt + text for text in examples["text"]]

    tokenized = tokenizer(
        modified_texts,
        truncation=True,
        padding="max_length",
        max_length=256,
        # Requested explicitly because the label mask below depends on it.
        return_attention_mask=True,
    )

    # Use the attention mask rather than the pad token id: if the pad token
    # is the same as EOS, masking by id would also hide real EOS tokens.
    tokenized["labels"] = [
        [token_id if attended else -100 for token_id, attended in zip(ids, mask)]
        for ids, mask in zip(tokenized["input_ids"], tokenized["attention_mask"])
    ]

    return tokenized
|
| |
|
| |
|
| |
|
| |
|
# Tokenize the whole dataset in batches.
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Training hyper-parameters; checkpoints are written under ./lora.
training_args = TrainingArguments(
    output_dir="./lora",
    # Optimisation schedule
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=8,
    per_device_train_batch_size=10,
    # Mixed-precision training
    fp16=True,
    # Logging / checkpoint cadence, in optimizer steps
    logging_steps=10000,
    save_steps=15000,
)

trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=tokenized_dataset,
)

# Run fine-tuning (no evaluation dataset is configured).
trainer.train()
|
| |
|
| |
|
# Sampling settings used for the post-training smoke test below.
generation_config = {
    "max_new_tokens": 1024,
    "temperature": 1.0,
    "top_p": 0.9,
    "top_k": 40,
    "repetition_penalty": 1.2,
    "do_sample": True,
    # NOTE(review): this option is named for encoder-decoder models; how a
    # decoder-only model treats it appears to depend on the installed
    # transformers version - confirm it does what is intended here.
    "encoder_no_repeat_ngram_size": 4,
}

# Training runs the model in train mode. Switch to eval mode so dropout
# (including the LoRA adapters' lora_dropout=0.1) is disabled while sampling.
model.eval()

# Same instruction prefix as used in training, followed by the keywords.
prompt = "根据以下关键词生成一首歌词,歌词中包含多个句子,句子与句子之间使用/隔开,让我们一步一步的思考(思考过程包含在<think>和</think>之间):温柔,轮廓,洒脱:"
input_ids = tokenizer.encode(prompt, return_tensors="pt").to(model.device)

# Pass input_ids by keyword: the PEFT wrapper's generate() has not always
# accepted positional arguments, while the keyword form works everywhere.
outputs = model.generate(input_ids=input_ids, **generation_config)

# Special tokens are kept in the decoded text on purpose
# (skip_special_tokens=False).
decoded = tokenizer.decode(outputs[0], skip_special_tokens=False)

print(decoded)

# Save the trained adapter. The destination is left empty here and must be
# set before the script can run.
model.save_pretrained("")