datawithsuman committed on
Commit c6e5236 · verified · 1 Parent(s): 3c2c80c

Create app.py


Prompt Optimization to save LLM API cost.

Files changed (1)
  1. app.py +212 -0
app.py ADDED
@@ -0,0 +1,212 @@
# !pip install -U pymupdf
# !pip install llama-index-embeddings-openai
# !pip install llama-index-llms-openai
# !pip install chromadb
# !pip install llama-index-vector-stores-chroma
# !pip install pydantic==1.10.11
# !pip install llama-index-retrievers-bm25
# !pip install sentence-transformers
# !pip install llmlingua
# !pip install accelerate
# !pip install rouge
# !pip install semantic-text-similarity
# !pip install evaluate
# !pip install streamlit

import os
import streamlit as st
import streamlit.components.v1 as components
import openai
from llama_index.llms.openai import OpenAI

from llama_index.core import SimpleDirectoryReader, VectorStoreIndex, StorageContext, PropertyGraphIndex
from llama_index.core.indices.property_graph import (
    ImplicitPathExtractor,
    SimpleLLMPathExtractor,
)
from llama_index.retrievers.bm25 import BM25Retriever
from llama_index.core.retrievers import BaseRetriever
from llama_index.core.node_parser import SentenceSplitter
from llama_index.embeddings.openai import OpenAIEmbedding
from llmlingua import PromptCompressor
from rouge import Rouge
from semantic_text_similarity.models import WebBertSimilarity
import nest_asyncio

# Apply nest_asyncio so LlamaIndex's async calls work inside Streamlit's event loop
nest_asyncio.apply()

# OpenAI credentials (the API key is read from the MODEL_REPO_ID environment variable)
key = os.getenv('MODEL_REPO_ID')
openai.api_key = key
os.environ["OPENAI_API_KEY"] = key

# Streamlit UI
st.title("Prompt Optimization for One-Stop Policy QA Bot")

uploaded_files = st.file_uploader("Upload a PDF file", type="pdf", accept_multiple_files=True)

if uploaded_files:
    for uploaded_file in uploaded_files:
        # NOTE: the reader expects the uploaded PDF to already exist under ../data/,
        # and only the last file's documents are kept (see "Future scope" item 2).
        reader = SimpleDirectoryReader(input_files=[f"../data/{uploaded_file.name}"])
        documents = reader.load_data()
        st.success("File uploaded...")

    # Indexing
    index = PropertyGraphIndex.from_documents(
        documents,
        embed_model=OpenAIEmbedding(model_name="text-embedding-3-small"),
        kg_extractors=[
            ImplicitPathExtractor(),
            SimpleLLMPathExtractor(
                llm=OpenAI(model="gpt-3.5-turbo", temperature=0.3),
                num_workers=4,
                max_paths_per_chunk=10,
            ),
        ],
        show_progress=True,
    )
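    # For reference: ImplicitPathExtractor derives graph paths from relationships the
    # nodes already carry, while SimpleLLMPathExtractor asks the gpt-3.5-turbo LLM to
    # extract up to max_paths_per_chunk triples from each chunk.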

    # Save Knowledge Graph
    index.property_graph_store.save_networkx_graph(name="../data/kg.html")

    # Display the graph in Streamlit
    st.success("File Processed...")
    st.success("Creating Knowledge Graph...")
    with open("../data/kg.html", "r", encoding="utf-8") as html_file:
        source_code = html_file.read()
    components.html(source_code, height=500, width=700)

    # Retrieval
    kg_retriever = index.as_retriever(
        include_text=True,  # include source text, default True
    )

    # Generation
    model = "gpt-3.5-turbo"

    def get_context(query):
        contexts = kg_retriever.retrieve(query)
        context_list = [n.text for n in contexts]
        return context_list


    # Returns [prompt_tokens, completion_tokens, total_tokens, answer_text]
    def res(prompt):

        response = openai.chat.completions.create(
            model=model,
            messages=[
                {"role": "system",
                 "content": "You are a helpful assistant who answers from the following context. If the answer can't be found in the context, just say that you don't know; don't try to make up an answer."},
                {"role": "user",
                 "content": prompt},
            ]
        )

        return [response.usage.prompt_tokens, response.usage.completion_tokens, response.usage.total_tokens, response.choices[0].message.content]


    # Initialize session state for token summary, evaluation details, and chat messages
    if "token_summary" not in st.session_state:
        st.session_state.token_summary = []
    if "messages" not in st.session_state:
        st.session_state.messages = []

    # Display chat messages from history on app rerun
    for message in st.session_state.messages:
        with st.chat_message(message["role"]):
            st.markdown(message["content"])

    # Accept user input
    if prompt := st.chat_input("Enter your query:"):
        st.success("Fetching info...")
        # Add user message to chat history
        st.session_state.messages.append({"role": "user", "content": prompt})
        with st.chat_message("user"):
            st.markdown(prompt)

        # Generate response: retrieve context from the knowledge graph
        context_list = get_context(prompt)
        context = " ".join(context_list)


        # Original prompt response (uncompressed context + query)
        full_prompt = "\n\n".join([context, prompt])
        orig_res = res(full_prompt)
        st.session_state.messages.append({"role": "assistant", "content": "Generating Original prompt response..."})
        st.session_state.messages.append({"role": "assistant", "content": orig_res[3]})
        st.success("Generating Original prompt response...")
        with st.chat_message("assistant"):
            st.markdown(orig_res[3])

        # Compressed Response
        st.session_state.messages.append({"role": "assistant", "content": "Generating Optimized prompt response..."})
        st.success("Generating Optimized prompt response...")

        # LLMLingua-2 compressor; device_map="mps" assumes Apple Silicon,
        # so "cpu" or "cuda" would be needed on other hardware
        llm_lingua = PromptCompressor(
            model_name="microsoft/llmlingua-2-xlm-roberta-large-meetingbank",
            use_llmlingua2=True, device_map="mps"
        )

        def prompt_compression(context, rate=0.5):
            compressed_context = llm_lingua.compress_prompt(
                context,
                rate=rate,
                force_tokens=["!", ".", "?", "\n"],
                drop_consecutive=True,
            )
            return compressed_context
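        # compress_prompt returns a dict: 'compressed_prompt' is used below to build the
        # optimized prompt, and the 'origin_tokens' / 'compressed_tokens' counts it also
        # carries are the ones referenced (commented out) in the savings section further down.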
        compressed_context = prompt_compression(context)
        full_prompt = "\n\n".join([compressed_context['compressed_prompt'], prompt])
        compressed_res = res(full_prompt)
        st.session_state.messages.append({"role": "assistant", "content": compressed_res[3]})
        with st.chat_message("assistant"):
            st.markdown(compressed_res[3])

        # Evaluate the optimized response against the original response
        rouge = Rouge()
        scores = rouge.get_scores(compressed_res[3], orig_res[3])
        webert_model = WebBertSimilarity(device='cpu')
        similarity_score = webert_model.predict([(compressed_res[3], orig_res[3])])[0] / 5 * 100
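        # ROUGE-L F1 is on a 0-1 scale (multiplied by 100 below), and WebBertSimilarity
        # predicts similarity on a 0-5 scale, which the line above rescales to a percentage.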


        # Display token summary
        st.session_state.messages.append({"role": "assistant", "content": "Token Length Summary..."})
        st.success('Token Length Summary...')
        st.session_state.messages.append({"role": "assistant", "content": f"Original Prompt has {orig_res[0]} tokens"})
        st.write(f"Original Prompt has {orig_res[0]} tokens")
        st.session_state.messages.append({"role": "assistant", "content": f"Optimized Prompt has {compressed_res[0]} tokens"})
        st.write(f"Optimized Prompt has {compressed_res[0]} tokens")

        st.session_state.messages.append({"role": "assistant", "content": "Comparing Original and Optimized Prompt Response..."})
        st.success("Comparing Original and Optimized Prompt Response...")
        st.session_state.messages.append({"role": "assistant", "content": f"Rouge Score : {scores[0]['rouge-l']['f'] * 100}"})
        st.write(f"Rouge Score : {scores[0]['rouge-l']['f'] * 100}")
        st.session_state.messages.append({"role": "assistant", "content": f"Semantic Text Similarity Score : {similarity_score}"})
        st.write(f"Semantic Text Similarity Score : {similarity_score}")

        st.write(" ")
        # origin_tokens = compressed_context['origin_tokens']
        # compressed_tokens = compressed_context['compressed_tokens']
        origin_tokens = orig_res[0]
        compressed_tokens = compressed_res[0]
        # Estimated saving, assuming GPT-4 pricing of $0.06 per 1K tokens
        saving = (origin_tokens - compressed_tokens) * 0.06 / 1000
        st.session_state.messages.append({"role": "assistant", "content": f"The optimized prompt saves ${saving:.4f} at GPT-4 rates."})
        st.success(f"The optimized prompt saves ${saving:.4f} at GPT-4 rates.")


### Future scope -

# 1. Make this run in the JPMC system.
# 2. Scale it to read multiple files at once.
# 3. Cache the LLMLingua RoBERTa model so it isn't downloaded on every run (see the sketch after this list).
# 4. Play around with the LLMLingua hyperparameters and observe the changes in output and dollar value.

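# A possible sketch for item 3 (hypothetical, not part of this commit): wrapping the
# PromptCompressor construction in st.cache_resource would load the RoBERTa weights once
# per Streamlit process instead of on every query; device_map="cpu" here is an assumption.
#
# @st.cache_resource
# def load_compressor():
#     return PromptCompressor(
#         model_name="microsoft/llmlingua-2-xlm-roberta-large-meetingbank",
#         use_llmlingua2=True,
#         device_map="cpu",
#     )
#
# llm_lingua = load_compressor()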

### References -

# 1. https://docs.llamaindex.ai/en/stable/understanding/
# 2. https://github.com/microsoft/LLMLingua/blob/main/examples/LLMLingua2.ipynb