---
base_model: cognitivecomputations/dolphin-2.9-llama3-8b
language:
- en
license: apache-2.0
tags:
- text-generation-inference
- transformers
- unsloth
- llama
- trl
- sft
---

# Uploaded model

- **Developed by:** AashishKumar
- **License:** apache-2.0
- **Finetuned from model:** cognitivecomputations/dolphin-2.9-llama3-8b

Example inference with the ChatML template applied through Unsloth:

```python
from unsloth import FastLanguageModel
from unsloth.chat_templates import get_chat_template

# Assumes `model` and `tokenizer` have already been loaded (see the loading sketch below)
tokenizer = get_chat_template(
    tokenizer,
    chat_template="chatml",  # Adjust to match the template used during fine-tuning
    # Map ShareGPT-style keys ("from"/"value", "human"/"gpt") onto the ChatML roles
    mapping={"role": "from", "content": "value", "user": "human", "assistant": "gpt"},
    map_eos_token=True,
)

FastLanguageModel.for_inference(model)  # Enable Unsloth's optimized inference mode

messages = [
    {"from": "system", "value": "You are an assistant designed to answer any user question like a normal human would. Make sure any names are in English."},
    {"from": "human", "value": "mujhe kuch acchi movies recommend kro"},  # Example Hinglish input: "recommend me some good movies"
]

inputs = tokenizer.apply_chat_template(
    messages,
    truncation=True,
    tokenize=True,
    add_generation_prompt=True,
    return_tensors="pt",
).to("cuda")

outputs = model.generate(
    input_ids=inputs,
    max_new_tokens=64,
    do_sample=True,
    temperature=0.7,
    top_k=50,
    top_p=0.95,
    use_cache=True,
    no_repeat_ngram_size=3,
    num_return_sequences=1,
)

decoded_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
print(decoded_outputs)  # Adjust output handling to your application's needs
```
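
The snippet above assumes `model` and `tokenizer` are already initialized. A minimal loading sketch with Unsloth is shown below; the repository id is a placeholder (this card does not state the exact upload path), and `max_seq_length` and `load_in_4bit` are assumed values you may need to adjust for your hardware.

```python
from unsloth import FastLanguageModel

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="AashishKumar/dolphin-2.9-llama3-8b-finetune",  # placeholder repo id, not confirmed by this card
    max_seq_length=2048,  # assumed context window
    dtype=None,           # auto-select (bfloat16 on supported GPUs)
    load_in_4bit=True,     # assumed 4-bit quantized loading to reduce VRAM use
)
```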