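"""Streamlit RAG demo: embeds the user's query with Cohere, retrieves the closest
chunks from a Pinecone index, and asks a locally loaded LLaMA model (wrapped in a
LangChain LLMChain) to summarize the retrieved answers."""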
import streamlit as st
import langchain
import pinecone
import transformers
import accelerate
from torch import cuda, bfloat16
from transformers import pipeline
from langchain.vectorstores import Chroma, Pinecone
from langchain.embeddings import CohereEmbeddings
from langchain.llms import HuggingFacePipeline
from langchain import LLMChain, PromptTemplate
from transformers import LlamaForCausalLM, LlamaTokenizer
st.title("Language Model Chain")
PINECONE_API_KEY = '80414b32-6e4f-40d5-aa3e-f9d09535006c'
PINECONE_API_ENV = 'northamerica-northeast1-gcp'
cohere_api_key = 'VQBpxCtpSiu3PLUyBBkNIdyQaM5qM8svfmnD3L4h'
pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_API_ENV)
index_name = "langchain"
embeddings = CohereEmbeddings(cohere_api_key=cohere_api_key)
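# Raw Pinecone index handle used for manual similarity search further down; each
# match is expected to carry the source chunk under metadata["text"].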
index = pinecone.Index("langchain")
print ("Program Started")
# selected_model = st.selectbox("Select Model", ["decapoda-research/llama-7b-hf", "chainyo/alpaca-lora-7b"])
# # Display the selected model
# st.write("Selected Model:", selected_model)
model_loaded = False
model = None
repo_id="decapoda-research/llama-7b-hf"
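# Cache the model and tokenizer across Streamlit reruns so the weights are only
# loaded once per session. Note: st.cache(allow_output_mutation=True) is deprecated
# in newer Streamlit releases in favour of st.cache_resource.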
@st.cache(allow_output_mutation=True)
def load_model():
    # Build an empty-weight copy of the model to inspect its memory layout.
    config = transformers.AutoConfig.from_pretrained(repo_id)
    with accelerate.init_empty_weights():
        fake_model = transformers.AutoModelForCausalLM.from_config(config)
    device_map = accelerate.infer_auto_device_map(fake_model)
    # The real load relies on device_map="auto" (the map computed above is not passed
    # explicitly) plus 8-bit quantization to fit the 7B model in memory.
    model = transformers.LlamaForCausalLM.from_pretrained(
        repo_id,
        device_map="auto",
        load_in_8bit=True,
        cache_dir="./cache",
    )
    tokenizer = LlamaTokenizer.from_pretrained(repo_id)
    return model, tokenizer

# Runs on every script rerun, not only after the model has actually been loaded.
print("Model Loaded")
# Initialize session state variables
if "model_loaded" not in st.session_state:
    st.session_state["model_loaded"] = False
if "model" not in st.session_state:
    st.session_state["model"] = None
if "tokenizer" not in st.session_state:
    st.session_state["tokenizer"] = None

# Display the "Load Model" button
if not st.session_state["model_loaded"]:
    if st.button("Load Model"):
        model1, tokenizer1 = load_model()
        st.session_state["model"] = model1
        st.session_state["tokenizer"] = tokenizer1
        st.session_state["model_loaded"] = True
else:
    model1 = st.session_state["model"]
    tokenizer1 = st.session_state["tokenizer"]
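# The sliders below map one-to-one onto keyword arguments of the transformers
# text-generation pipeline that is built when "Generate Text" is clicked.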
if st.session_state["model_loaded"]:
    # Set up initial values for pipeline parameters
    temperature = st.slider("Temperature: 'randomness' of outputs (0.0 is the minimum, 1.0 the maximum)", min_value=0.0, max_value=1.0, value=0.1, step=0.1)
    top_p = st.slider("Top-p: sample only from the smallest set of tokens whose probabilities add up to top_p", min_value=0.0, max_value=1.0, value=0.1, step=0.1)
    top_k = st.slider("Top-k: sample only from the k most likely tokens (0 disables it and relies on top_p)", min_value=0, max_value=100, value=20, step=1)
    max_new_tokens = st.slider("Max new tokens: maximum number of tokens to generate in the output", min_value=0, max_value=512, value=256, step=1)
    # transformers requires repetition_penalty > 0; 1.0 means no penalty.
    repetition_penalty = st.slider("Repetition penalty: without it the output tends to repeat itself (1.0 = no penalty)", min_value=0.0, max_value=2.0, value=1.1, step=0.1)

    # Number of retrieved documents
    num_of_docs = st.selectbox("Number of Retrieved Documents", range(2, 11), index=0)
    query = st.text_area("Query Text", height=150)
    show_documents = st.checkbox("Show Retrieved Documents")

    # Set up the prompt template
    template = """Given the question "{instruction}" and its relevant answers as "{answers}", summarize the answers in context of the question"""
    prompt = PromptTemplate(input_variables=["instruction", "answers"], template=template)
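    # Clicking "Generate Text" rebuilds the HF pipeline with the slider values, embeds
    # the query with Cohere, pulls the top matches from Pinecone, and asks the LLM to
    # summarize them via the prompt template above.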
    if st.button("Generate Text"):
        # Build the text-generation pipeline with the user-selected parameters.
        generate_text = pipeline(
            model=model1, tokenizer=tokenizer1,
            return_full_text=True,  # langchain expects the full text
            task='text-generation',
            # device=device
            # model parameters are passed here too
            # stopping_criteria=stopping_criteria,  # without this the model may ramble
            temperature=temperature,  # 'randomness' of outputs, 0.0 is the min and 1.0 the max
            top_p=top_p,  # sample from the top tokens whose probabilities add up to top_p
            top_k=top_k,  # sample from the top_k most likely tokens (0 relies on top_p)
            max_new_tokens=max_new_tokens,  # max number of tokens to generate in the output
            repetition_penalty=repetition_penalty  # without this the output begins repeating
        )
        llm = HuggingFacePipeline(pipeline=generate_text)
        llm_chain = LLMChain(llm=llm, prompt=prompt)
        print("Inside Function")

        # Embed the query, retrieve the top matches from Pinecone, and join their text.
        query_vector = embeddings.embed_query(query)
        query_response = index.query(top_k=num_of_docs, include_metadata=True, vector=query_vector)
        docs = []
        for result in query_response['matches']:
            docs.append(result['metadata']['text'])
        answers = ' '.join(docs)
        if show_documents:
            st.text_area("Retrieved Vectors", answers)

        # Summarize the retrieved answers in the context of the question.
        text = llm_chain.predict(instruction=query, answers=answers)
        st.text_area("Result", text)
        cuda.empty_cache()
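        # Note: the pipeline and LLMChain are rebuilt on every click; one alternative
        # (not done here) would be to cache them in st.session_state and only update
        # the generation kwargs.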