# Load the HF_TOKEN from the .env file
from dotenv import load_dotenv

load_dotenv()

import torch
import transformers
from transformers import TextIteratorStreamer, AutoTokenizer, AutoModelForCausalLM

# Load the Llama 3 model and tokenizer
local_model_path = "meta-llama/Meta-Llama-3-8B-Instruct"
model = AutoModelForCausalLM.from_pretrained(local_model_path, torch_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained(local_model_path, padding_side='left')

# Set up the text-generation pipeline
pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1  # Use GPU if available
)


def chat_function(message, history, system_prompt, max_new_tokens, temperature):
    # Build the chat from the system prompt, the previous turns, and the new message
    messages = [{"role": "system", "content": system_prompt}]
    for user_msg, assistant_msg in history:
        messages.append({"role": "user", "content": user_msg})
        messages.append({"role": "assistant", "content": assistant_msg})
    messages.append({"role": "user", "content": message})

    prompt = pipeline.tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    # Stop on either the regular EOS token or Llama 3's end-of-turn token
    terminators = [
        pipeline.tokenizer.eos_token_id,
        pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>")
    ]
    temp = temperature + 0.1  # Keep the temperature strictly positive since do_sample=True
    outputs = pipeline(
        prompt,
        max_new_tokens=max_new_tokens,
        eos_token_id=terminators,
        do_sample=True,
        temperature=temp,
        top_p=0.9,
    )
    # Strip the prompt so only the newly generated reply is returned
    return outputs[0]["generated_text"][len(prompt):]


message = "Hello, can you teach me past simple?"
history = [("Hi!", "I'm doing well, thanks for asking!")]
temperature = 0.7
max_new_tokens = 50
prompt = ("Act as an English tutor. Always correct grammar and spelling mistakes. "
          "Always keep the conversation going by asking follow-up questions.")

response = chat_function(message=message,
                         history=history,
                         system_prompt=prompt,
                         max_new_tokens=max_new_tokens,
                         temperature=temperature)
print(response)
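
TextIteratorStreamer is imported above but never used by chat_function. The sketch below is an assumption about how it could be wired in to stream the reply token by token instead of returning it in one piece: it reuses model and tokenizer from the setup above, takes an already chat-templated prompt string, and the stream_chat helper name is hypothetical, not part of the original script.

# A minimal sketch (assumption, not from the original script): streaming generation
# with TextIteratorStreamer. model and tokenizer are reused from the setup above;
# stream_chat is a hypothetical helper and expects a chat-templated prompt string.
from threading import Thread


def stream_chat(prompt_text, max_new_tokens=50, temperature=0.7):
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    inputs = tokenizer(prompt_text, return_tensors="pt").to(model.device)
    terminators = [
        tokenizer.eos_token_id,
        tokenizer.convert_tokens_to_ids("<|eot_id|>"),
    ]
    generation_kwargs = dict(
        **inputs,
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=temperature,
        top_p=0.9,
        eos_token_id=terminators,
    )
    # generate() blocks, so it runs in a background thread while we consume the stream
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()
    reply = ""
    for new_text in streamer:  # yields decoded text chunks as they are produced
        reply += new_text
        print(new_text, end="", flush=True)
    thread.join()
    return reply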