# yes-no / app.py
import os
import textwrap

import streamlit as st

# Llama-2 model prep
import torch
from huggingface_hub import login
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from langchain import HuggingFacePipeline, PromptTemplate, LLMChain
@st.cache_resource
def load_llm():
    # Authenticate with the Hugging Face Hub. The access token is read from an
    # environment variable / Space secret (the name HF_TOKEN is an assumption;
    # use whatever secret name is configured) rather than hardcoded in source.
    login(os.environ["HF_TOKEN"])

    tokenizer = AutoTokenizer.from_pretrained(
        "meta-llama/Llama-2-13b-chat-hf",
        use_auth_token=True,
    )
    model = AutoModelForCausalLM.from_pretrained(
        "meta-llama/Llama-2-13b-chat-hf",
        device_map="auto",
        torch_dtype=torch.float16,
        use_auth_token=True,
        # load_in_8bit=True,
        # load_in_4bit=True
    )

    # Wrap the model in a text-generation pipeline for later use.
    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        max_new_tokens=512,
        do_sample=True,
        top_k=30,
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id,
    )
    llm = HuggingFacePipeline(pipeline=pipe, model_kwargs={"temperature": 0})
    return tokenizer, model, pipe, llm
# Load once; st.cache_resource keeps the model alive across Streamlit reruns.
tokenizer, model, pipe, llm = load_llm()
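# Llama-2 chat prompt delimiters: user instructions are wrapped in
# [INST] ... [/INST] and the (optional) system prompt in <<SYS>> ... <</SYS>>.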
B_INST, E_INST = "[INST]", "[/INST]"
B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"
DEFAULT_SYSTEM_PROMPT = """\
You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.
If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information."""
def get_prompt(instruction, new_system_prompt=DEFAULT_SYSTEM_PROMPT):
    SYSTEM_PROMPT = B_SYS + new_system_prompt + E_SYS
    prompt_template = B_INST + SYSTEM_PROMPT + instruction + E_INST
    return prompt_template
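# Illustrative only: for instruction "Is it raining?" (with the default system
# prompt abbreviated as <sys>), get_prompt returns a string shaped like
#   "[INST]<<SYS>>\n<sys>\n<</SYS>>\n\nIs it raining?[/INST]"
# which matches the chat format the Llama-2 *-chat-hf checkpoints expect.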
def cut_off_text(text, prompt):
    # Truncate the text at the first occurrence of `prompt`, if present.
    cutoff_phrase = prompt
    index = text.find(cutoff_phrase)
    if index != -1:
        return text[:index]
    else:
        return text


def remove_substring(string, substring):
    return string.replace(substring, "")
def generate(text):
    # Standalone helper (not used by the Streamlit UI below): generate a reply
    # directly with model.generate instead of going through the LangChain chain.
    prompt = get_prompt(text)
    with torch.autocast('cuda', dtype=torch.bfloat16):
        inputs = tokenizer(prompt, return_tensors="pt").to('cuda')
        outputs = model.generate(
            **inputs,
            max_new_tokens=512,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.eos_token_id,
        )
        final_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
        final_outputs = cut_off_text(final_outputs, '</s>')
        final_outputs = remove_substring(final_outputs, prompt)

    return final_outputs  # , outputs
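# Example (illustrative): generate("Summarise the plot of Hamlet in one sentence.")
# would return only the newly generated continuation, with the prompt text and
# anything after a '</s>' marker stripped out.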
def parse_text(text):
    wrapped_text = textwrap.fill(text, width=100)
    print(wrapped_text + '\n\n')
    # return assistant_text
    return wrapped_text
def answer(context, question):
    # Note: {text} is left unformatted on purpose; it is the input variable of
    # the LangChain PromptTemplate built below.
    instruction = (
        f"conversation: '''{context}'''"
        + "\nBased on the conversation provided in triple quotes, answer the next question.\nQuestion: {text}"
    )
    system_prompt = (
        "You are an expert and answer any question based on the conversation. "
        "You analyse the conversation in light of the question, then you answer "
        "with yes, no or not clear only. You only output one or two words."
    )
    template = get_prompt(instruction, system_prompt)
    print(template)
    prompt = PromptTemplate(template=template, input_variables=["text"])
    llm_chain = LLMChain(prompt=prompt, llm=llm)
    output = llm_chain.run(question)
    return parse_text(output)
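# Example (illustrative):
#   answer("A: Are you coming tonight? B: Yes, I'll be there at 8.",
#          "Did B accept the invitation?")
# should yield a short verdict such as "Yes", per the system prompt above.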
# --- Streamlit UI ---
question = st.sidebar.text_input('Question', 'Can she answer')
context = st.text_area('Context', 'conversation')

if st.sidebar.button('Answer'):
    outputs = answer(context, question)
    st.sidebar.write(f"Answer is {outputs}")