kouki321 committed on
Commit c6bbadb · verified · 1 Parent(s): 39fea48

Update app.py

Files changed (1)
  1. app.py +50 -100
app.py CHANGED
@@ -1,77 +1,9 @@
-import streamlit as st
 import torch
-from transformers import AutoTokenizer, AutoModelForCausalLM
-from transformers.cache_utils import DynamicCache
-import os
-from time import time
 import pandas as pd
-import psutil
-
-# ==============================
-# Helper: Human-readable bytes
-def sizeof_fmt(num, suffix="B"):
-    # Formats bytes as human-readable (e.g. 1.5 GB)
-    for unit in ["", "K", "M", "G", "T"]:
-        if abs(num) < 1024.0:
-            return f"{num:3.2f} {unit}{suffix}"
-        num /= 1024.0
-    return f"{num:.2f} P{suffix}"
-
-# ==============================
-# System & Cache Resource Stats
-def get_system_stats(doc_text=None, cache_mem_bytes=0):
-    ram = psutil.virtual_memory()
-    cpu = psutil.cpu_percent()
-    disk = psutil.disk_usage('/')
-    used, total = ram.used, ram.total
-    stats = {
-        "Input Tokens": st.session_state.input_tokens_count,
-        "Output Tokens": st.session_state.output_tokens_count,
-        "Generated Tokens": st.session_state.generated_tokens_count,
-        "Document Size (chars)": len(doc_text) if doc_text else 0,
-        "Document Size (KB)": f"{len(doc_text.encode('utf-8')) / 1024:.2f}" if doc_text else 0,
-    }
-    if torch.cuda.is_available():
-        gpu_mem_alloc = torch.cuda.memory_allocated()
-        gpu_mem_total = torch.cuda.get_device_properties(0).total_memory
-        stats["GPU Used"] = sizeof_fmt(gpu_mem_alloc)
-        stats["GPU Total"] = sizeof_fmt(gpu_mem_total)
-        stats["GPU Usage (%)"] = round(100 * gpu_mem_alloc / gpu_mem_total, 2) if gpu_mem_total else 0
-    else:
-        stats["GPU Used"] = "N/A"
-        stats["GPU Total"] = "N/A"
-        stats["GPU Usage (%)"] = "N/A"
-
-    stats["KV Cache Memory Used"] = sizeof_fmt(cache_mem_bytes)
-    stats["KV Cache as % RAM"] = f"{(cache_mem_bytes / total) * 100:.2f}%" if total > 0 else "N/A"
-    stats["KV Cache as % GPU"] = (
-        f"{(cache_mem_bytes / torch.cuda.get_device_properties(0).total_memory) * 100:.2f}%"
-        if torch.cuda.is_available() else "N/A"
-    )
-    return stats
-
-def cache_stats_table(cache):
-    if cache is None:
-        return pd.DataFrame(), 0
-    rows = []
-    total_mem = 0
-    for i, (key, value) in enumerate(zip(cache.key_cache, cache.value_cache)):
-        key_mem = key.element_size() * key.nelement()
-        value_mem = value.element_size() * value.nelement()
-        total_mem += key_mem + value_mem
-        row = {
-            "Layer": i,
-            "Key Shape": str(tuple(key.shape)),
-            "Value Shape": str(tuple(value.shape)),
-            "Total Mem": sizeof_fmt(key_mem + value_mem),
-            "Last Key Tokens": str(tuple(key[..., -1:, :].shape)),
-            "Last Value Tokens": str(tuple(value[..., -1:, :].shape)),
-        }
-        rows.append(row)
-    return pd.DataFrame(rows), total_mem
+import streamlit as st
+from time import time
+from transformers import AutoTokenizer, AutoModelForCausalLM
 
-# ==============================
-# Core Model and Caching Logic
 def generate(model, input_ids, past_key_values, max_new_tokens=50):
     """Token-by-token generation using cache for speed."""
     device = model.model.embed_tokens.weight.device
@@ -163,25 +95,6 @@ def load_document_and_cache(file_path):
         st.error(f"Document file not found at {file_path}")
         return None, None, None, None
 
-# ==============================
-# Main Streamlit UI and Workflow
-st.title("DeepSeek QA: Supercharged Caching & Memory Dashboard")
-
-# Initialize session state variables
-if "doc_uploaded" not in st.session_state:
-    st.session_state.update({
-        "doc_uploaded": False,
-        "doc_text": None,
-        "cache": None,
-        "origin_len": None,
-        "doc_text_count": None,
-        "generated_tokens_count": 0,
-        "input_tokens_count": 0,
-        "output_tokens_count": 0,
-        "cache_gen_duration": 0.0,
-        "output_gen_duration": 0.0,
-    })
-
 # File upload
 uploaded_file = st.file_uploader("📁 Upload your document (.txt)", type="txt")
 if uploaded_file:
@@ -201,13 +114,39 @@ if uploaded_file:
         "output_tokens_count": 0,
         "cache_gen_duration": 0.0,
         "output_gen_duration": 0.0,
+        "stats_table": pd.DataFrame(columns=[
+            "Timestamp",
+            "Document Characters",
+            "Document Size (KB)",
+            "Cache Memory Used",
+            "Cache as % RAM",
+            "Cache as % GPU"
+        ])
     })
 
     # Measure cache generation time
     cache_start_time = time()
-    st.session_state["cache"], st.session_state["origin_len"], st.session_state["doc_text"], st.session_state["doc_text_count"] = load_document_and_cache(temp_file_path)
-    cache_end_time = time()
-    st.session_state["cache_gen_duration"] = cache_end_time - cache_start_time
+    try:
+        # Simulated function `load_document_and_cache`
+        st.session_state["cache"], st.session_state["origin_len"], st.session_state["doc_text"], st.session_state["doc_text_count"] = load_document_and_cache(temp_file_path)
+        cache_end_time = time()
+        st.session_state["cache_gen_duration"] = cache_end_time - cache_start_time
+
+        # Log the resource usage after cache generation
+        log_time = time()
+        total_ram = torch.cuda.get_device_properties(0).total_memory if torch.cuda.is_available() else 1
+        cache_mem_bytes = st.session_state["cache"].get_cache_memory() if st.session_state["cache"] else 0
+        st.session_state["stats_table"] = log_resource_usage(
+            st.session_state["stats_table"],
+            st.session_state["doc_text"],
+            cache_mem_bytes,
+            total_ram,
+            log_time
+        )
+
+    except Exception as e:
+        st.error(f"Failed to generate cache: {str(e)}")
+        st.stop()
 
     # Display document preview
     with st.expander("📄 Document Preview"):
@@ -218,6 +157,7 @@ if uploaded_file:
     if query and st.button("Generate Answer"):
         with st.spinner("Generating answer..."):
             try:
+                # Simulated function `load_model_and_tokenizer`
                 model, tokenizer = load_model_and_tokenizer(st.session_state["doc_text_count"])
                 current_cache = clone_cache(st.session_state["cache"])
 
@@ -226,7 +166,10 @@ if uploaded_file:
                 full_prompt = f"<|user|>\nQuestion: {query}\n<|assistant|>"
                 input_ids = tokenizer(full_prompt, return_tensors="pt").input_ids
                 st.session_state["input_tokens_count"] += input_ids.shape[-1]
+
+                # Simulated function `generate`
                 output_ids, generated_tokens_count = generate(model, input_ids, current_cache)
+
                 response = tokenizer.decode(output_ids[0], skip_special_tokens=True)
                 output_gen_end_time = time()
                 st.session_state["output_gen_duration"] = output_gen_end_time - output_gen_start_time
@@ -242,15 +185,22 @@ if uploaded_file:
                     f"Output Generation Time: {st.session_state['output_gen_duration']:.2f} seconds"
                 )
 
-                # Display cache stats table
-                table, total_mem = cache_stats_table(st.session_state["cache"])
-                if not table.empty:
-                    st.write("Cache Statistics Table:")
-                    st.dataframe(table)
-                else:
-                    st.write("No data available in the cache.")
+                # Log the resource usage after output generation
+                log_time = time()
+                cache_mem_bytes = st.session_state["cache"].get_cache_memory() if st.session_state["cache"] else 0
+                st.session_state["stats_table"] = log_resource_usage(
+                    st.session_state["stats_table"],
+                    st.session_state["doc_text"],
+                    cache_mem_bytes,
+                    total_ram,
+                    log_time
+                )
 
             except Exception as e:
                 st.error(f"Failed to generate answer: {str(e)}")
+
+    # Display the stats table
+    st.write("📊 Resource Usage Log:")
+    st.dataframe(st.session_state["stats_table"])
 else:
     st.info("Please upload a document to start.")
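
Note: the new code calls log_resource_usage(...) and st.session_state["cache"].get_cache_memory(), neither of which is defined in the hunks shown here; both presumably live elsewhere in app.py. As a rough sketch only, inferred from the call sites and the stats_table columns above (the parameter names beyond the call order, the timestamp and MB formatting, and the handling of the RAM column are assumptions, not the committed implementation), such a helper could look like:

import time
import pandas as pd

def log_resource_usage(stats_table, doc_text, cache_mem_bytes, total_ram, log_time):
    # Hypothetical helper: append one resource-usage row to stats_table and
    # return the updated DataFrame. Only the call signature and the column
    # names are taken from the diff; everything else is a guess.
    row = {
        "Timestamp": time.strftime("%H:%M:%S", time.localtime(log_time)),
        "Document Characters": len(doc_text) if doc_text else 0,
        "Document Size (KB)": round(len(doc_text.encode("utf-8")) / 1024, 2) if doc_text else 0,
        "Cache Memory Used": f"{cache_mem_bytes / 1024**2:.2f} MB",
        # The caller passes GPU total memory (or 1 on CPU) as total_ram; how the
        # committed code derives a system-RAM percentage is not shown in the diff.
        "Cache as % RAM": "N/A",
        "Cache as % GPU": f"{100 * cache_mem_bytes / total_ram:.2f}%" if total_ram > 1 else "N/A",
    }
    return pd.concat([stats_table, pd.DataFrame([row])], ignore_index=True)

get_cache_memory() is likewise assumed to sum the cache's key/value tensor sizes, much as the removed cache_stats_table helper did with key.element_size() * key.nelement().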