Create app.py
app.py
ADDED
@@ -0,0 +1,105 @@
import streamlit as st
import langchain
import pinecone
import transformers
import accelerate
from torch import cuda, bfloat16
from transformers import pipeline

from langchain.vectorstores import Chroma, Pinecone
from langchain.embeddings import CohereEmbeddings
from langchain.llms import HuggingFacePipeline
from langchain import LLMChain, PromptTemplate
from transformers import LlamaForCausalLM, LlamaTokenizer

st.title("Language Model Chain")

# NOTE: the credentials below are hardcoded in the repository; see the note
# after this listing on loading them from the Space's secrets instead.
PINECONE_API_KEY = '80414b32-6e4f-40d5-aa3e-f9d09535006c'
PINECONE_API_ENV = 'northamerica-northeast1-gcp'
cohere_api_key = 'VQBpxCtpSiu3PLUyBBkNIdyQaM5qM8svfmnD3L4h'

pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_API_ENV)
index_name = "langchain"
embeddings = CohereEmbeddings(cohere_api_key=cohere_api_key)
index = pinecone.Index(index_name)
print("Program Started")

# selected_model = st.selectbox("Select Model", ["decapoda-research/llama-7b-hf", "chainyo/alpaca-lora-7b"])
# # Display the selected model
# st.write("Selected Model:", selected_model)

repo_id = "decapoda-research/llama-7b-hf"


@st.cache(allow_output_mutation=True)
def load_model():
    # Infer a device map from an empty (meta) copy of the model, then load the
    # real weights in 8-bit, passing the inferred map so it is actually used.
    config = transformers.AutoConfig.from_pretrained(repo_id)
    with accelerate.init_empty_weights():
        fake_model = transformers.AutoModelForCausalLM.from_config(config)
    device_map = accelerate.infer_auto_device_map(fake_model)
    model = transformers.LlamaForCausalLM.from_pretrained(
        repo_id,
        load_in_8bit=True,
        device_map=device_map,
        cache_dir="./cache",
    )
    tokenizer = LlamaTokenizer.from_pretrained(repo_id)
    return model, tokenizer

# Streamlit reruns the whole script on every widget interaction, so a plain
# Python flag set inside the button block would reset as soon as any other
# widget changed. Remember the click in st.session_state instead; load_model()
# is cached, so later reruns reuse the already-loaded weights.
if st.button("Load Model"):
    st.session_state["model_loaded"] = True

model_loaded = st.session_state.get("model_loaded", False)

if model_loaded:
    model1, tokenizer1 = load_model()
    print("Model Loaded")
    # Pipeline parameters
    temperature = st.slider("Temperature ('randomness' of outputs; 0.0 is the min, 1.0 the max)", min_value=0.0, max_value=1.0, value=0.1, step=0.1)
    top_p = st.slider("Top P (sample from the smallest set of tokens whose probabilities add up to top_p)", min_value=0.0, max_value=1.0, value=0.1, step=0.1)
    top_k = st.slider("Top K (sample from the top K tokens; 0 relies on top_p alone)", min_value=0, max_value=100, value=20, step=1)
    max_new_tokens = st.slider("Max New Tokens (maximum number of tokens to generate in the output)", min_value=0, max_value=512, value=256, step=1)
    repetition_penalty = st.slider("Repetition Penalty (without this the output begins repeating)", min_value=0.0, max_value=2.0, value=1.1, step=0.1)

    # Number of retrieved documents
    num_of_docs = st.selectbox("Number of Retrieved Documents", range(2, 11), index=0)

    query = st.text_area("Query Text", height=150)
    show_documents = st.checkbox("Show Retrieved Documents")

    # Set up the prompt template
    template = """Given the question "{instruction}" and its relevant answers as "{answers}", summarize the {answers} to answer the question"""
    prompt = PromptTemplate(input_variables=["instruction", "answers"], template=template)

    if st.button("Generate Text"):
        # Build the generation pipeline with the chosen parameters
        generate_text = pipeline(
            model=model1,
            tokenizer=tokenizer1,
            return_full_text=True,  # langchain expects the full text
            task='text-generation',
            # device=device
            # stopping_criteria=stopping_criteria,  # without this the model may ramble
            temperature=temperature,  # 'randomness' of outputs, 0.0 is the min and 1.0 the max
            top_p=top_p,  # sample from the smallest set of tokens whose probabilities add up to top_p
            top_k=top_k,  # sample from the top K tokens (0 relies on top_p)
            max_new_tokens=max_new_tokens,  # max number of tokens to generate in the output
            repetition_penalty=repetition_penalty,  # without this the output begins repeating
        )

        llm = HuggingFacePipeline(pipeline=generate_text)
        llm_chain = LLMChain(llm=llm, prompt=prompt)

        print("Inside Function")

        # Retrieve the most similar documents from Pinecone
        query_vector = embeddings.embed_query(query)
        query_response = index.query(top_k=num_of_docs, include_metadata=True, vector=query_vector)
        docs = []
        for result in query_response['matches']:
            docs.append(result['metadata']['text'])
        answers = ' '.join(docs)

        if show_documents:
            st.text_area("Retrieved Vectors", answers)

        # Summarize the retrieved documents with the LLM chain and display the result
        text = llm_chain.predict(instruction=query, answers=answers)
        st.text_area("Result", text)
        cuda.empty_cache()
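The Pinecone and Cohere keys in this file are committed in plain text. A minimal sketch of reading them from the Space's repository secrets instead, assuming secrets named PINECONE_API_KEY, PINECONE_API_ENV, and COHERE_API_KEY have been configured (those names are placeholders, not values defined in this repo):

import os

# Hypothetical secret names; on Hugging Face Spaces, repository secrets are
# exposed to the running app as environment variables.
PINECONE_API_KEY = os.environ["PINECONE_API_KEY"]
PINECONE_API_ENV = os.environ["PINECONE_API_ENV"]
cohere_api_key = os.environ["COHERE_API_KEY"]

pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_API_ENV)
embeddings = CohereEmbeddings(cohere_api_key=cohere_api_key)

On newer Streamlit releases, @st.cache(allow_output_mutation=True) is deprecated; @st.cache_resource is the usual replacement for caching a loaded model.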