---
tags:
- autotrain
- text-generation
- meta-llama
- meta-llama/Llama-2-7b-hf
inference: true
widget:
- text: >
    instruction: "If you are a doctor, please answer the medical questions based
    on the patient's description."

    input: "Hi, I had a subarachnoid bleed and coiling of brain aneurysm last year.
    I am having some major bilateral temple pain along with numbness that comes
    and goes in my left arm/hand/fingers. I have had headaches since the aneurysm,
    but this is different. Also, my moods have been horrible for the past few weeks."

    response: ''
library_name: peft
---

```python
# Log in with a token generated from https://huggingface.co/settings/tokens
# (access to the gated meta-llama/Llama-2-7b-hf base model is required).
!huggingface-cli login
```

```python
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the base Llama-2-7B model, then attach the fine-tuned PEFT adapter.
config = PeftConfig.from_pretrained("Ashishkr/llama2_medical_consultation")
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf")
model = PeftModel.from_pretrained(model, "Ashishkr/llama2_medical_consultation").to(device)
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
```

```python
def llama_generate(
    model: AutoModelForCausalLM,
    tokenizer: AutoTokenizer,
    prompt: str,
    max_new_tokens: int = 128,
    temperature: float = 0.92,
) -> str:
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    inputs = tokenizer(
        [prompt],
        return_tensors="pt",
        return_token_type_ids=False,
    ).to(device)

    # Sampling must be enabled for `temperature` to have an effect; the Llama
    # tokenizer has no pad token by default, so fall back to the EOS token.
    with torch.autocast(device.type, dtype=torch.bfloat16):
        response = model.generate(
            **inputs,
            do_sample=True,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            return_dict_in_generate=True,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.pad_token_id or tokenizer.eos_token_id,
        )

    decoded_output = tokenizer.decode(
        response["sequences"][0],
        skip_special_tokens=True,
    )

    # Strip the prompt so only the newly generated text is returned.
    return decoded_output[len(prompt):]


prompt = """
instruction: "If you are a doctor, please answer the medical questions based on the patient's description.",
input: "My baby has been pooing 5-6 times a day for a week. In the last few days it has increased to 7 and they are very watery with green stringy bits in them. He does not seem unwell i.e. no temperature and still eating. He now has a very bad nappy rash from the pooing ... help!" .\n
response: """

response = llama_generate(
    model,
    tokenizer,
    prompt,
    max_new_tokens=100,
    temperature=0.92,
)
print(response)
```
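
The prompts above follow a simple `instruction` / `input` / `response` template. As a convenience, the sketch below wraps that template in a small helper; `build_prompt` is a hypothetical name introduced here for illustration (it is not part of the adapter repo), and it assumes the same template shown in the example above.

```python
def build_prompt(instruction: str, patient_input: str) -> str:
    # Hypothetical helper: assembles the instruction/input/response template
    # used in the generation example above.
    return (
        f'\ninstruction: "{instruction}",\n'
        f'input: "{patient_input}" .\n\n'
        "response: "
    )


# Example usage, reusing the patient description from the widget above.
prompt = build_prompt(
    "If you are a doctor, please answer the medical questions based on the patient's description.",
    "Hi, I had a subarachnoid bleed and coiling of brain aneurysm last year. "
    "I am having some major bilateral temple pain along with numbness that comes "
    "and goes in my left arm/hand/fingers.",
)
response = llama_generate(model, tokenizer, prompt, max_new_tokens=100, temperature=0.92)
print(response)
```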