316usman committed
Commit 2876ad0
1 Parent(s): b058857

Create app.py

Files changed (1)
  1. app.py +105 -0
app.py ADDED
@@ -0,0 +1,105 @@
import streamlit as st
import langchain
import pinecone
import transformers
import accelerate
from torch import cuda, bfloat16
from transformers import pipeline

from langchain.vectorstores import Chroma, Pinecone
from langchain.embeddings import CohereEmbeddings
from langchain.llms import HuggingFacePipeline
from langchain import LLMChain, PromptTemplate
from transformers import LlamaForCausalLM, LlamaTokenizer


st.title("Language Model Chain")
PINECONE_API_KEY = '80414b32-6e4f-40d5-aa3e-f9d09535006c'
PINECONE_API_ENV = 'northamerica-northeast1-gcp'
cohere_api_key = 'VQBpxCtpSiu3PLUyBBkNIdyQaM5qM8svfmnD3L4h'
pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_API_ENV)
index_name = "langchain"
embeddings = CohereEmbeddings(cohere_api_key=cohere_api_key)
index = pinecone.Index(index_name)
print("Program Started")
# selected_model = st.selectbox("Select Model", ["decapoda-research/llama-7b-hf", "chainyo/alpaca-lora-7b"])

# # Display the selected model
# st.write("Selected Model:", selected_model)

# Track the load state in session_state so it survives Streamlit reruns;
# a plain local flag would reset to False on every interaction.
if "model_loaded" not in st.session_state:
    st.session_state.model_loaded = False
model = None
repo_id = "decapoda-research/llama-7b-hf"

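# Download the LLaMA weights once, load them in 8-bit, and cache the
# model/tokenizer so Streamlit reruns do not reload them.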
@st.cache(allow_output_mutation=True)
def load_model():
    # Infer a device map from an empty-weight copy of the model, then load
    # the real weights in 8-bit across the mapped devices.
    config = transformers.AutoConfig.from_pretrained(repo_id)
    with accelerate.init_empty_weights():
        fake_model = transformers.AutoModelForCausalLM.from_config(config)
    device_map = accelerate.infer_auto_device_map(fake_model)
    model = transformers.LlamaForCausalLM.from_pretrained(
        repo_id,
        load_in_8bit=True,
        device_map=device_map,
        cache_dir="./cache",
    )
    tokenizer = LlamaTokenizer.from_pretrained(repo_id)
    return model, tokenizer

if st.button("Load Model"):
    st.session_state.model_loaded = True

if st.session_state.model_loaded:
    # load_model() is cached, so reruns after the first click reuse the loaded weights
    model1, tokenizer1 = load_model()
    print("Model Loaded")

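# UI controls for the sampling parameters, the number of retrieved documents, and the user query.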
if st.session_state.model_loaded:
    # Set up initial values for pipeline parameters
    temperature = st.slider("Temperature: 'randomness' of outputs, 0.0 is the min and 1.0 the max", min_value=0.0, max_value=1.0, value=0.1, step=0.1)
    top_p = st.slider("Top P: sample from the smallest set of tokens whose probabilities add up to this value", min_value=0.0, max_value=1.0, value=0.1, step=0.1)
    top_k = st.slider("Top K: sample from the K most likely tokens (0 relies on top_p instead)", min_value=0, max_value=100, value=20, step=1)
    max_new_tokens = st.slider("Max New Tokens: maximum number of tokens to generate in the output", min_value=0, max_value=512, value=256, step=1)
    repetition_penalty = st.slider("Repetition Penalty: without this the output begins repeating", min_value=0.0, max_value=2.0, value=1.0, step=0.1)
    # Number of retrieved documents
    num_of_docs = st.selectbox("Number of Retrieved Documents", range(2, 11), index=0)

    query = st.text_area("Query Text", height=150)
    show_documents = st.checkbox("Show Retrieved Documents")
    # Set up the prompt template
    template = """Given the question "{instruction}" and its relevant answers as "{answers}", summarize the {answers} to answer the question"""
    prompt = PromptTemplate(input_variables=["instruction", "answers"], template=template)

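    # When "Generate Text" is pressed, wrap the model in a HuggingFace pipeline,
    # retrieve the top-k matching passages from Pinecone, and let the LLM chain
    # summarize them into an answer.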
    if st.button("Generate Text"):
        # Call the pipeline and display the generated text
        generate_text = pipeline(
            model=model1, tokenizer=tokenizer1,
            return_full_text=True,  # langchain expects the full text
            task='text-generation',
            # device=device
            # we pass model parameters here too
            # stopping_criteria=stopping_criteria,  # without this the model will ramble
            temperature=temperature,  # 'randomness' of outputs, 0.0 is the min and 1.0 the max
            top_p=top_p,  # sample from the smallest set of tokens whose probabilities add up to top_p
            top_k=top_k,  # sample from the K most likely tokens (0 relies on top_p instead)
            max_new_tokens=max_new_tokens,  # max number of tokens to generate in the output
            repetition_penalty=repetition_penalty  # without this the output begins repeating
        )

        llm = HuggingFacePipeline(pipeline=generate_text)
        llm_chain = LLMChain(llm=llm, prompt=prompt)

        print("Inside Function")
        query_vector = embeddings.embed_query(query)
        query_response = index.query(top_k=num_of_docs, include_metadata=True, vector=query_vector)
        docs = []
        for result in query_response['matches']:
            docs.append(result['metadata']['text'])
        answers = ' '.join(docs)
        if show_documents:
            st.text_area("Retrieved Documents", answers)
        text = llm_chain.predict(instruction=query, answers=answers)

        st.text_area("Result", text)
        cuda.empty_cache()