# !pip install -q -U git+https://github.com/huggingface/accelerate.git
# !pip install -q -U bitsandbytes
# !pip install -q -U git+https://github.com/huggingface/transformers.git

import streamlit as st
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline, TextStreamer

model_id = 'Huma97/main'

# # Llama 2 inference with streaming output
# model = AutoModelForCausalLM.from_pretrained(model_id, device_map='auto', torch_dtype='auto')
# tokenizer = AutoTokenizer.from_pretrained(model_id)
#
# def stream(user_prompt):
#     system_prompt = 'You are a helpful assistant that provides accurate and concise responses'
#     B_INST, E_INST = "[INST]", "[/INST]"
#     B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"
#     prompt = f"{B_INST} {B_SYS}{system_prompt.strip()}{E_SYS}{user_prompt.strip()} {E_INST}\n\n"
#     inputs = tokenizer([prompt], return_tensors="pt").to("cuda:0")
#     streamer = TextStreamer(tokenizer)
#     # Besides returning the usual output, the streamer also prints the generated text to stdout.
#     _ = model.generate(**inputs, streamer=streamer, max_new_tokens=500)
#
# st.write(stream('Count to ten'))

# Load the pre-trained LLaMA 2 model and tokenizer
# (torch_dtype is a model argument, not a tokenizer argument)
model = AutoModelForCausalLM.from_pretrained(model_id, device_map='auto', torch_dtype='auto')
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Create a Streamlit text input field
input_text = st.text_input("Enter your prompt:")

# Define a function to generate text using LLaMA 2
def generate_text(input_text):
    # Tokenize the prompt and move it to the model's device before generating
    inputs = tokenizer.encode(input_text, return_tensors="pt").to(model.device)
    outputs = model.generate(inputs, max_length=250, num_return_sequences=1)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Create a Streamlit button to trigger text generation
if st.button("Generate Text"):
    output_text = generate_text(input_text)
    st.write(output_text)
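
# Optional: the bitsandbytes install and the BitsAndBytesConfig import above suggest
# quantized loading was intended. A minimal sketch (an assumption, not part of the
# original app) of loading the same model in 4-bit to reduce GPU memory:
#
# import torch
# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_compute_dtype=torch.bfloat16,
# )
# model = AutoModelForCausalLM.from_pretrained(
#     model_id, device_map='auto', quantization_config=bnb_config
# )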