316usman committed
Commit 2876ad0
1 Parent(s): b058857

Create app.py

Files changed (1)
  1. app.py +105 -0
app.py ADDED
@@ -0,0 +1,105 @@
import streamlit as st
import langchain
import pinecone
import transformers
import accelerate
from torch import cuda, bfloat16
from transformers import pipeline

from langchain.vectorstores import Chroma, Pinecone
from langchain.embeddings import CohereEmbeddings
from langchain.llms import HuggingFacePipeline
from langchain import LLMChain, PromptTemplate
from transformers import LlamaForCausalLM, LlamaTokenizer


st.title("Language Model Chain")
PINECONE_API_KEY = '80414b32-6e4f-40d5-aa3e-f9d09535006c'
PINECONE_API_ENV = 'northamerica-northeast1-gcp'
cohere_api_key = 'VQBpxCtpSiu3PLUyBBkNIdyQaM5qM8svfmnD3L4h'
pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_API_ENV)
index_name = "langchain"
embeddings = CohereEmbeddings(cohere_api_key=cohere_api_key)
index = pinecone.Index(index_name)
print("Program Started")
# selected_model = st.selectbox("Select Model", ["decapoda-research/llama-7b-hf", "chainyo/alpaca-lora-7b"])

# # Display the selected model
# st.write("Selected Model:", selected_model)

# Track the load state in session_state so it survives Streamlit reruns;
# a plain local flag would reset to False on every interaction.
if "model_loaded" not in st.session_state:
    st.session_state.model_loaded = False
model = None
repo_id = "decapoda-research/llama-7b-hf"

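# Download the LLaMA weights once, load them in 8-bit, and cache the
# model/tokenizer so Streamlit reruns do not reload them.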
@st.cache(allow_output_mutation=True)
def load_model():
    # Infer a device map from an empty-weight copy of the model, then load
    # the real weights in 8-bit across the mapped devices.
    config = transformers.AutoConfig.from_pretrained(repo_id)
    with accelerate.init_empty_weights():
        fake_model = transformers.AutoModelForCausalLM.from_config(config)
    device_map = accelerate.infer_auto_device_map(fake_model)
    model = transformers.LlamaForCausalLM.from_pretrained(
        repo_id,
        load_in_8bit=True,
        device_map=device_map,
        cache_dir="./cache",
    )
    tokenizer = LlamaTokenizer.from_pretrained(repo_id)
    return model, tokenizer

if st.button("Load Model"):
    st.session_state.model_loaded = True

if st.session_state.model_loaded:
    # load_model() is cached, so reruns after the first click reuse the loaded weights
    model1, tokenizer1 = load_model()
    print("Model Loaded")

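# UI controls for the sampling parameters, the number of retrieved documents, and the user query.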
if st.session_state.model_loaded:
    # Set up initial values for pipeline parameters
    temperature = st.slider("Temperature: 'randomness' of outputs, 0.0 is the min and 1.0 the max", min_value=0.0, max_value=1.0, value=0.1, step=0.1)
    top_p = st.slider("Top P: sample from the smallest set of tokens whose probabilities add up to this value", min_value=0.0, max_value=1.0, value=0.1, step=0.1)
    top_k = st.slider("Top K: sample from the K most likely tokens (0 relies on top_p instead)", min_value=0, max_value=100, value=20, step=1)
    max_new_tokens = st.slider("Max New Tokens: maximum number of tokens to generate in the output", min_value=0, max_value=512, value=256, step=1)
    repetition_penalty = st.slider("Repetition Penalty: without this the output begins repeating", min_value=0.0, max_value=2.0, value=1.0, step=0.1)
    # Number of retrieved documents
    num_of_docs = st.selectbox("Number of Retrieved Documents", range(2, 11), index=0)

    query = st.text_area("Query Text", height=150)
    show_documents = st.checkbox("Show Retrieved Documents")
    # Set up the prompt template
    template = """Given the question "{instruction}" and its relevant answers as "{answers}", summarize the {answers} to answer the question"""
    prompt = PromptTemplate(input_variables=["instruction", "answers"], template=template)

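    # When "Generate Text" is pressed, wrap the model in a HuggingFace pipeline,
    # retrieve the top-k matching passages from Pinecone, and let the LLM chain
    # summarize them into an answer.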
    if st.button("Generate Text"):
        # Call the pipeline and display the generated text
        generate_text = pipeline(
            model=model1, tokenizer=tokenizer1,
            return_full_text=True,  # langchain expects the full text
            task='text-generation',
            # device=device
            # we pass model parameters here too
            # stopping_criteria=stopping_criteria,  # without this the model will ramble
            temperature=temperature,  # 'randomness' of outputs, 0.0 is the min and 1.0 the max
            top_p=top_p,  # sample from the smallest set of tokens whose probabilities add up to top_p
            top_k=top_k,  # sample from the K most likely tokens (0 relies on top_p instead)
            max_new_tokens=max_new_tokens,  # max number of tokens to generate in the output
            repetition_penalty=repetition_penalty  # without this the output begins repeating
        )

        llm = HuggingFacePipeline(pipeline=generate_text)
        llm_chain = LLMChain(llm=llm, prompt=prompt)

        print("Inside Function")
        query_vector = embeddings.embed_query(query)
        query_response = index.query(top_k=num_of_docs, include_metadata=True, vector=query_vector)
        docs = []
        for result in query_response['matches']:
            docs.append(result['metadata']['text'])
        answers = ' '.join(docs)
        if show_documents:
            st.text_area("Retrieved Documents", answers)
        text = llm_chain.predict(instruction=query, answers=answers)

        st.text_area("Result", text)
        cuda.empty_cache()