# yes-no / app.py
import os
import textwrap

import streamlit as st

# Llama-2 model prep
import torch
from huggingface_hub import login
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from langchain import HuggingFacePipeline, PromptTemplate, LLMChain
@st.cache_resource
def load_llm():
    # Authenticate with the Hugging Face Hub. The access token is read from an
    # environment variable / Space secret (the name HF_TOKEN is an assumption;
    # use whatever secret name is configured) rather than hardcoded in source.
    login(os.environ["HF_TOKEN"])

    tokenizer = AutoTokenizer.from_pretrained(
        "meta-llama/Llama-2-13b-chat-hf",
        use_auth_token=True,
    )
    model = AutoModelForCausalLM.from_pretrained(
        "meta-llama/Llama-2-13b-chat-hf",
        device_map="auto",
        torch_dtype=torch.float16,
        use_auth_token=True,
        # load_in_8bit=True,
        # load_in_4bit=True
    )

    # Wrap the model in a text-generation pipeline for later use.
    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        max_new_tokens=512,
        do_sample=True,
        top_k=30,
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id,
    )
    llm = HuggingFacePipeline(pipeline=pipe, model_kwargs={"temperature": 0})
    return tokenizer, model, pipe, llm
# Load once; st.cache_resource keeps the model alive across Streamlit reruns.
tokenizer, model, pipe, llm = load_llm()
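# Llama-2 chat prompt delimiters: user instructions are wrapped in
# [INST] ... [/INST] and the (optional) system prompt in <<SYS>> ... <</SYS>>.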
B_INST, E_INST = "[INST]", "[/INST]"
B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"
DEFAULT_SYSTEM_PROMPT = """\
You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.
If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information."""
def get_prompt(instruction, new_system_prompt=DEFAULT_SYSTEM_PROMPT):
    SYSTEM_PROMPT = B_SYS + new_system_prompt + E_SYS
    prompt_template = B_INST + SYSTEM_PROMPT + instruction + E_INST
    return prompt_template
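# Illustrative only: for instruction "Is it raining?" (with the default system
# prompt abbreviated as <sys>), get_prompt returns a string shaped like
#   "[INST]<<SYS>>\n<sys>\n<</SYS>>\n\nIs it raining?[/INST]"
# which matches the chat format the Llama-2 *-chat-hf checkpoints expect.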
def cut_off_text(text, prompt):
    # Truncate the text at the first occurrence of `prompt`, if present.
    cutoff_phrase = prompt
    index = text.find(cutoff_phrase)
    if index != -1:
        return text[:index]
    else:
        return text


def remove_substring(string, substring):
    return string.replace(substring, "")
def generate(text):
    # Standalone helper (not used by the Streamlit UI below): generate a reply
    # directly with model.generate instead of going through the LangChain chain.
    prompt = get_prompt(text)
    with torch.autocast('cuda', dtype=torch.bfloat16):
        inputs = tokenizer(prompt, return_tensors="pt").to('cuda')
        outputs = model.generate(
            **inputs,
            max_new_tokens=512,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.eos_token_id,
        )
        final_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
        final_outputs = cut_off_text(final_outputs, '</s>')
        final_outputs = remove_substring(final_outputs, prompt)

    return final_outputs  # , outputs
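# Example (illustrative): generate("Summarise the plot of Hamlet in one sentence.")
# would return only the newly generated continuation, with the prompt text and
# anything after a '</s>' marker stripped out.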
def parse_text(text):
    wrapped_text = textwrap.fill(text, width=100)
    print(wrapped_text + '\n\n')
    # return assistant_text
    return wrapped_text
def answer(context, question):
    # Note: {text} is left unformatted on purpose; it is the input variable of
    # the LangChain PromptTemplate built below.
    instruction = (
        f"conversation: '''{context}'''"
        + "\nBased on the conversation provided in triple quotes, answer the next question.\nQuestion: {text}"
    )
    system_prompt = (
        "You are an expert and answer any question based on the conversation. "
        "You analyse the conversation in light of the question, then you answer "
        "with yes, no or not clear only. You only output one or two words."
    )
    template = get_prompt(instruction, system_prompt)
    print(template)
    prompt = PromptTemplate(template=template, input_variables=["text"])
    llm_chain = LLMChain(prompt=prompt, llm=llm)
    output = llm_chain.run(question)
    return parse_text(output)
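# Example (illustrative):
#   answer("A: Are you coming tonight? B: Yes, I'll be there at 8.",
#          "Did B accept the invitation?")
# should yield a short verdict such as "Yes", per the system prompt above.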
# --- Streamlit UI ---
question = st.sidebar.text_input('Question', 'Can she answer')
context = st.text_area('Context', 'conversation')

if st.sidebar.button('Answer'):
    outputs = answer(context, question)
    st.sidebar.write(f"Answer is {outputs}")