---
license: apache-2.0
datasets:
- kinokokoro/ichikara-instruction-003
language:
- ja
base_model:
- llm-jp/llm-jp-3-13b
library_name: transformers
tags:
- text-generation-inference
- transformers
---

# Sample Use

The script below loads the fine-tuned model and runs batched greedy inference over a JSONL task file:

```python
import os
import json

import torch
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM

MODEL_DIR = os.path.join("model_dir")

def load_model():
    print("Loading model and tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
    tokenizer.padding_side = "left"  # left-pad so batched generation continues from the end of each prompt
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_DIR,
        torch_dtype=torch.float16,
        device_map={"": 0},  # place the whole model on GPU 0
        use_cache=True,      # enable the KV cache
    )
    model.eval()  # inference mode
    return model, tokenizer


def generate_predictions(model, tokenizer, input_file, output_file):
    BATCH_SIZE = 8  # number of prompts processed per forward pass

    print(f"Reading input file: {input_file}")
    tasks = []
    with open(input_file, 'r', encoding='utf-8') as f:
        for line in f:
            tasks.append(json.loads(line))

    results = []
    print("Running inference...")

    # Process the tasks in batches
    for i in tqdm(range(0, len(tasks), BATCH_SIZE)):
        batch_tasks = tasks[i:i + BATCH_SIZE]
        prompts = [f"入力: {task['input']}\n出力: " for task in batch_tasks]

        # Tokenize the whole batch at once
        inputs = tokenizer(
            prompts,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512
        ).to(model.device)

        with torch.no_grad():
            outputs = model.generate(
                inputs.input_ids,
                attention_mask=inputs.attention_mask,
                max_new_tokens=512,          # cap on newly generated tokens, excluding the prompt
                do_sample=False,             # greedy decoding
                repetition_penalty=1.2,
                pad_token_id=tokenizer.pad_token_id,
                num_return_sequences=1,
                use_cache=True               # reuse the KV cache during generation
            )

        # Decode each sequence and keep only the text after the "出力: " marker
        for j, output in enumerate(outputs):
            generated_text = tokenizer.decode(output, skip_special_tokens=True)
            output_text = generated_text.split("出力: ")[-1].strip()
            results.append({
                "task_id": batch_tasks[j]["task_id"],
                "output": output_text
            })

    print(f"Saving results to: {output_file}")
    with open(output_file, 'w', encoding='utf-8') as f:
        for result in results:
            json.dump(result, f, ensure_ascii=False)
            f.write('\n')
```
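
For reference, a minimal driver that wires the two helpers together might look like the sketch below. The file names `input.jsonl` and `output.jsonl` are placeholders, not part of the model card, and the input is assumed to be one JSON object per line with `task_id` and `input` fields, as expected by `generate_predictions`.

```python
# Minimal usage sketch; file names are hypothetical placeholders.
if __name__ == "__main__":
    model, tokenizer = load_model()
    generate_predictions(
        model,
        tokenizer,
        input_file="input.jsonl",    # assumed JSONL with "task_id" and "input" fields
        output_file="output.jsonl",  # predictions written as JSONL, one object per line
    )
```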