Spaces:

kouki321
/

Third_Try_Cag_pdf

Sleeping

App Files Files Community

kouki321 committed on May 27

Commit

e2c4e20

verified ·

1 Parent(s): fe55efb

Update app.py

Browse files

Files changed (1) hide show

app.py +114 -22

app.py CHANGED Viewed

@@ -5,6 +5,7 @@ from transformers.cache_utils import DynamicCache
 import os
 from time import time
 import pandas as pd
 # ==============================
@@ -80,7 +81,7 @@ def load_model_and_tokenizer(doc_text_count):
     tokenizer = AutoTokenizer.from_pretrained(
         model_name,
         trust_remote_code=True,
-        model_max_length= 1.5*round(doc_text_count * 0.3 + 1)
     )
     model = AutoModelForCausalLM.from_pretrained(
         model_name,
@@ -103,7 +104,7 @@ def load_document_and_cache(file_path):
         t2=time()
         with open(file_path, "r", encoding="utf-8") as f:
             doc_text = f.read()
-        doc_text_count= len(doc_text)
         model, tokenizer = load_model_and_tokenizer(doc_text_count)
         system_prompt = f"""
         <|system|>
@@ -116,11 +117,108 @@ def load_document_and_cache(file_path):
         cache, origin_len = get_kv_cache(model, tokenizer, system_prompt)
         t3=time()
         print(f"{t3-t2}")
-        return cache, origin_len, doc_text ,doc_text_count,model, tokenizer
     except FileNotFoundError:
         st.error(f"Document file not found at {file_path}")
         return None, None, None, None
 # ==============================
 # Main Streamlit UI and Workflow
 # ==============================
@@ -129,22 +227,24 @@ st.title("🚀 DeepSeek QA: Supercharged Caching & Memory Dashboard")
 uploaded_file = st.file_uploader("📝 Upload your document (.txt)", type="txt")
 doc_text = None
-doc_text_count= None
 cache = None
 origin_len = None
 last_generation_time = None
 t1 = time()
 if uploaded_file:
     temp_file_path = "temp_document.txt"
     with open(temp_file_path, "wb") as f:
         f.write(uploaded_file.getvalue())
-    cache, origin_len, doc_text ,doc_text_count,model, tokenizer = load_document_and_cache(temp_file_path)
     with st.expander("📄 Document Preview"):
         st.text(doc_text[:500] + "..." if len(doc_text) > 500 else doc_text)
     query = st.text_input("🔎 Ask a question about the document:")
     if query and st.button("Generate Answer"):
-        with st.spinner("Generating answer..."):
             current_cache = clone_cache(cache)
             t_clone_end = time()
             Cache_create_time = t_clone_end - t1
@@ -154,33 +254,25 @@ if uploaded_file:
             <|assistant|>
             """.strip()
             input_ids = tokenizer(full_prompt, return_tensors="pt").input_ids
             t_gen_start = time()
             output_ids = generate(model, input_ids, current_cache)
             response = tokenizer.decode(output_ids[0], skip_special_tokens=True)
             t_gen_end = time()
             last_generation_time = t_gen_end - t_gen_start
             st.success("Answer:")
             st.write(response)
-            st.info(f"Cache create Time: {Cache_create_time:.2f} s  |  Generation Time: {last_generation_time:.2f} s   ")
             if st.button("💾 Save Cache"):
                 torch.save(clean_up(clone_cache(cache), origin_len), "saved_cache.pth")
                 st.success("Cache saved successfully!")
-        if query and st.button("calcul cache mb "):
-            t12=time()
-            cache_mem_bytes = calculate_cache_size(cache)
-            t123=time()
-            time_to=t123-t12
-            doc_text = len(doc_text)
-            st.info(f"time_to_calculate_cache_size: {time_to:} s |  cache mem bytes {cache_mem_bytes} MB  ")
-            st.info(f"doc_text_count: {doc_text_count:} char    ")
 else:
     st.info("Please upload a document to start.")
 # Sidebar: Load a previously saved cache
 st.sidebar.header("🛠️ Advanced Options")

 import os
 from time import time
 import pandas as pd
+import psutil
 # ==============================
     tokenizer = AutoTokenizer.from_pretrained(
         model_name,
         trust_remote_code=True,
+        model_max_length=2*round(doc_text_count * 0.3 + 1)
     )
     model = AutoModelForCausalLM.from_pretrained(
         model_name,
         t2=time()
         with open(file_path, "r", encoding="utf-8") as f:
             doc_text = f.read()
+        doc_text_count = len(doc_text)
         model, tokenizer = load_model_and_tokenizer(doc_text_count)
         system_prompt = f"""
         <|system|>
         cache, origin_len = get_kv_cache(model, tokenizer, system_prompt)
         t3=time()
         print(f"{t3-t2}")
+        return cache, origin_len, doc_text, doc_text_count
     except FileNotFoundError:
         st.error(f"Document file not found at {file_path}")
         return None, None, None, None
+# ==============================
+# System & Cache Resource Stats
+# ==============================
+def get_system_stats(doc_text=None, cache_mem_bytes=0):
+    ram = psutil.virtual_memory()
+    cpu = psutil.cpu_percent()
+    disk = psutil.disk_usage('/')
+    used, total = ram.used, ram.total
+    stats = {
+        "Input Tokens": st.session_state.get('input_tokens_count', 0),
+        "Output Tokens": st.session_state.get('output_tokens_count', 0),
+        "Generated Tokens": st.session_state.get('generated_tokens_count', 0),
+        "Document Size (chars)": len(doc_text) if doc_text else 0,
+        "Document Size (KB)": f"{len(doc_text.encode('utf-8')) / 1024:.2f}" if doc_text else 0,
+    }
+    if torch.cuda.is_available():
+        gpu_mem_alloc = torch.cuda.memory_allocated()
+        gpu_mem_total = torch.cuda.get_device_properties(0).total_memory
+        stats["GPU Used"] = sizeof_fmt(gpu_mem_alloc)
+        stats["GPU Total"] = sizeof_fmt(gpu_mem_total)
+        stats["GPU Usage (%)"] = round(100 * gpu_mem_alloc / gpu_mem_total, 2) if gpu_mem_total else 0
+    else:
+        stats["GPU Used"] = "N/A"
+        stats["GPU Total"] = "N/A"
+        stats["GPU Usage (%)"] = "N/A"
+    stats["KV Cache Memory Used"] = sizeof_fmt(cache_mem_bytes)
+    stats["KV Cache as % RAM"] = f"{(cache_mem_bytes / total) * 100:.2f}%" if total > 0 else "N/A"
+    stats["KV Cache as % GPU"] = (
+        f"{(cache_mem_bytes / torch.cuda.get_device_properties(0).total_memory) * 100:.2f}%"
+        if torch.cuda.is_available() else "N/A"
+    )
+    return stats
+def cache_stats_table(cache):
+    if cache is None:
+        return pd.DataFrame(), 0
+    rows = []
+    total_mem = 0
+    for i, (key, value) in enumerate(zip(cache.key_cache, cache.value_cache)):
+        key_mem = key.element_size() * key.nelement()
+        value_mem = value.element_size() * value.nelement()
+        total_mem += key_mem + value_mem
+        row = {
+            "Layer": i,
+            "Key Shape": str(tuple(key.shape)),
+            "Value Shape": str(tuple(value.shape)),
+            "Total Mem": sizeof_fmt(key_mem + value_mem),
+            "Last Key Tokens": str(tuple(key[..., -1:, :].shape)),
+            "Last Value Tokens": str(tuple(value[..., -1:, :].shape)),
+        }
+        rows.append(row)
+    return pd.DataFrame(rows), total_mem
+def resource_dashboard(cache, doc_text, generation_time=None, cache_clone_time=None):
+    cache_df, cache_mem_bytes = cache_stats_table(cache)
+    stats = get_system_stats(doc_text, cache_mem_bytes)
+    st.sidebar.header("🚦 Live Resource & Cache Dashboard")
+    st.sidebar.caption("See how your document and answers use your computer's memory and processing power. The KV Cache lets you answer questions super-fast!")
+    stats_table = pd.DataFrame(stats, index=["Value"]).T
+    st.sidebar.dataframe(stats_table, use_container_width=True, height=420)
+    if torch.cuda.is_available() and stats["GPU Usage (%)"] != "N/A":
+        gpu_pct = float(stats["GPU Usage (%)"])
+        st.sidebar.progress(int(min(gpu_pct, 100)), text=f"GPU Usage: {gpu_pct:.1f}%")
+    cache_pct_str = stats["KV Cache as % RAM"]
+    if isinstance(cache_pct_str, str) and cache_pct_str.endswith('%'):
+        try:
+            cache_pct = float(cache_pct_str[:-1])
+        except ValueError:
+            cache_pct = 0
+    else:
+        cache_pct = 0
+    st.sidebar.progress(int(min(cache_pct, 100)), text=f"KV Cache as RAM: {cache_pct:.1f}%")
+    if generation_time is not None or cache_clone_time is not None:
+        time_rows = []
+        if generation_time is not None:
+            time_rows.append({"Step": "Answer Generation", "Time (s)": f"{generation_time:.2f}"})
+        if cache_clone_time is not None:
+            time_rows.append({"Step": "Cache Copy", "Time (s)": f"{cache_clone_time:.2f}"})
+        st.sidebar.table(pd.DataFrame(time_rows))
+    with st.sidebar.expander("🧠 KV Cache Details (per Layer)", expanded=True):
+        st.markdown(
+            "The table below shows the shape, dtype, size, and memory used for each layer's cache in the neural network. Efficient caching speeds up new questions."
+        )
+        if not cache_df.empty:
+            st.dataframe(cache_df, use_container_width=True, height=340)
+        else:
+            st.info("No cache yet. Upload a document to see caching details.")
+# Initialize session state variables
+if 'generated_tokens_count' not in st.session_state:
+    st.session_state.generated_tokens_count = 0
+if 'input_tokens_count' not in st.session_state:
+    st.session_state.input_tokens_count = 0
+if 'output_tokens_count' not in st.session_state:
+    st.session_state.output_tokens_count = 0
 # ==============================
 # Main Streamlit UI and Workflow
 # ==============================
 uploaded_file = st.file_uploader("📝 Upload your document (.txt)", type="txt")
 doc_text = None
+doc_text_count = None
 cache = None
 origin_len = None
 last_generation_time = None
+last_cache_clone_time = None
 t1 = time()
 if uploaded_file:
     temp_file_path = "temp_document.txt"
     with open(temp_file_path, "wb") as f:
         f.write(uploaded_file.getvalue())
+    cache, origin_len, doc_text, doc_text_count = load_document_and_cache(temp_file_path)
     with st.expander("📄 Document Preview"):
         st.text(doc_text[:500] + "..." if len(doc_text) > 500 else doc_text)
     query = st.text_input("🔎 Ask a question about the document:")
     if query and st.button("Generate Answer"):
+        with st.spinner("Generating answer... (watch the sidebar for memory usage)"):
+            model, tokenizer = load_model_and_tokenizer(doc_text_count)
+            st.sidebar.write(f"Document character count: {len(doc_text)}")
             current_cache = clone_cache(cache)
             t_clone_end = time()
             Cache_create_time = t_clone_end - t1
             <|assistant|>
             """.strip()
             input_ids = tokenizer(full_prompt, return_tensors="pt").input_ids
+            st.session_state.input_tokens_count += input_ids.shape[-1]
             t_gen_start = time()
             output_ids = generate(model, input_ids, current_cache)
+            generated_tokens_count = output_ids.shape[-1]
+            st.session_state.generated_tokens_count += generated_tokens_count
+            st.session_state.output_tokens_count = generated_tokens_count
             response = tokenizer.decode(output_ids[0], skip_special_tokens=True)
             t_gen_end = time()
             last_generation_time = t_gen_end - t_gen_start
             st.success("Answer:")
             st.write(response)
+            st.info(f"Cache create Time: {Cache_create_time:.2f} s  |  Generation Time: {last_generation_time:.2f} s")
             if st.button("💾 Save Cache"):
                 torch.save(clean_up(clone_cache(cache), origin_len), "saved_cache.pth")
                 st.success("Cache saved successfully!")
+    resource_dashboard(cache, doc_text, last_generation_time, last_cache_clone_time)
 else:
     st.info("Please upload a document to start.")
+    resource_dashboard(None, None)
 # Sidebar: Load a previously saved cache
 st.sidebar.header("🛠️ Advanced Options")