import streamlit as st

# LLaMA prep
import json
import textwrap

import torch
from huggingface_hub import login
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from langchain import HuggingFacePipeline
from langchain import PromptTemplate, LLMChain


@st.cache_resource
def load_llm():
    # Hugging Face access token (normally read from an environment variable
    # rather than hard-coded).
    login("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")

    tokenizer = AutoTokenizer.from_pretrained(
        "meta-llama/Llama-2-13b-chat-hf",
        use_auth_token=True,
    )
    model = AutoModelForCausalLM.from_pretrained(
        "meta-llama/Llama-2-13b-chat-hf",
        device_map="auto",
        torch_dtype=torch.float16,
        use_auth_token=True,
        # load_in_8bit=True,
        # load_in_4bit=True
    )

    # Wrap the model in a transformers text-generation pipeline for later use.
    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        max_new_tokens=512,
        do_sample=True,
        top_k=30,
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id,
    )

    llm = HuggingFacePipeline(pipeline=pipe, model_kwargs={'temperature': 0})
    return tokenizer, model, pipe, llm


tokenizer, model, pipe, llm = load_llm()

# Llama-2 chat prompt delimiters.
B_INST, E_INST = "[INST]", "[/INST]"
B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"

DEFAULT_SYSTEM_PROMPT = """\
You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.

If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information."""


def get_prompt(instruction, new_system_prompt=DEFAULT_SYSTEM_PROMPT):
    # Builds a Llama-2 chat prompt of the form:
    # [INST] <<SYS>>\n{system prompt}\n<</SYS>>\n\n{instruction} [/INST]
    SYSTEM_PROMPT = B_SYS + new_system_prompt + E_SYS
    prompt_template = B_INST + SYSTEM_PROMPT + instruction + E_INST
    return prompt_template


def cut_off_text(text, prompt):
    # Truncate the generated text at the first occurrence of the cutoff phrase.
    cutoff_phrase = prompt
    index = text.find(cutoff_phrase)
    if index != -1:
        return text[:index]
    else:
        return text


def remove_substring(string, substring):
    return string.replace(substring, "")


def generate(text):
    prompt = get_prompt(text)
    with torch.autocast('cuda', dtype=torch.bfloat16):
        inputs = tokenizer(prompt, return_tensors="pt").to('cuda')
        outputs = model.generate(
            **inputs,
            max_new_tokens=512,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.eos_token_id,
        )
        final_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
        final_outputs = cut_off_text(final_outputs, '</s>')
        final_outputs = remove_substring(final_outputs, prompt)

    return final_outputs  # , outputs


def parse_text(text):
    wrapped_text = textwrap.fill(text, width=100)
    print(wrapped_text + '\n\n')
    # return assistant_text
    return wrapped_text


def answer(context, question):
    # Keep the {text} placeholder outside the f-string so PromptTemplate can
    # fill in the question at chain time.
    instruction = (
        f"conversation: '''{context}'''"
        + "\nBased on the conversation provided in triple quotes, answer the next question.\nQuestion: {text}"
    )
    system_prompt = (
        "You are an expert and answer any question based on the conversation. "
        "You analyse the conversation in light of the question, then you answer with yes, no or not clear only. "
        "You only output one or two words."
    )
    template = get_prompt(instruction, system_prompt)
    print(template)
    prompt = PromptTemplate(template=template, input_variables=["text"])
    llm_chain = LLMChain(prompt=prompt, llm=llm)
    output = llm_chain.run(question)
    return parse_text(output)


question = st.sidebar.text_input('Question', 'Can she answer')
context = st.text_area('Context', 'conversation')

if st.sidebar.button('Answer'):
    outputs = answer(context, question)
    st.sidebar.write(f"Answer is {outputs}")
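# Usage sketch (assumptions: this script is saved as app.py, the packages
# streamlit, torch, transformers, accelerate, langchain and huggingface_hub are
# installed, and a CUDA GPU with enough memory for Llama-2-13B in fp16 is
# available):
#
#   streamlit run app.py
#
# The sidebar collects the question, the main pane collects the conversation
# text, and clicking "Answer" runs the LLMChain and writes the model's
# yes / no / not clear verdict back to the sidebar.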