import streamlit as st
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline
from langchain_huggingface import HuggingFacePipeline  # or: from langchain_community.llms import HuggingFacePipeline

# Model ID, with the Llama 2 chat model as the default
model_id = st.text_input("Enter model_id", value="meta-llama/Llama-2-7b-chat-hf")

# Cache the model and tokenizer so they are not reloaded on every Streamlit rerun
@st.cache_resource
def load_model(model_id):
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        quantization_config=BitsAndBytesConfig(load_in_4bit=True),
        # attn_implementation="flash_attention_2",  # if you have an Ampere (or newer) GPU
    )
    return tokenizer, model

tokenizer, model = load_model(model_id)

# Generation parameters; st.text_input returns strings, so cast them to numbers
max_new_tokens = int(st.text_input("Enter max_new_tokens", value="100"))
top_k = int(st.text_input("Enter top_k", value="50"))
temperature = float(st.text_input("Enter temperature", value="0.1"))

# Build the generation pipeline from the user-supplied parameters
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=max_new_tokens,
    top_k=top_k,
    temperature=temperature,
)
llm = HuggingFacePipeline(pipeline=pipe)

query = st.chat_input("Enter your query")
if query:  # st.chat_input returns None until the user submits something
    st.write(query)
    st.write(llm.invoke(query))
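# To launch the app (assuming this file is saved as app.py):
#   streamlit run app.py
# Note: meta-llama/Llama-2-7b-chat-hf is a gated model on the Hugging Face Hub;
# you must accept the license there and authenticate locally
# (e.g. with `huggingface-cli login`) before it can be downloaded.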