kouki321 committed on
Commit 25dcf7e · verified · 1 Parent(s): c42bfcb

Update app.py

Files changed (1): app.py (+32, -103)
app.py CHANGED
@@ -5,7 +5,6 @@ from transformers.cache_utils import DynamicCache
 import os
 from time import time
 import pandas as pd
-import psutil
 
 
 # ==============================
@@ -18,7 +17,6 @@ def sizeof_fmt(num, suffix="B"):
         num /= 1024.0
     return f"{num:.2f} P{suffix}"
 
-
 # ==============================
 # Core Model and Caching Logic
 # ==============================
@@ -81,7 +79,7 @@ def load_model_and_tokenizer(doc_text_count):
     tokenizer = AutoTokenizer.from_pretrained(
         model_name,
         trust_remote_code=True,
-        model_max_length=2*round(doc_text_count * 0.3 + 1)
+        model_max_length=1.3*round(doc_text_count * 0.3 + 1)
     )
     model = AutoModelForCausalLM.from_pretrained(
         model_name,
@@ -122,95 +120,6 @@ def load_document_and_cache(file_path):
         st.error(f"Document file not found at {file_path}")
         return None, None, None, None, None, None
 
-# ==============================
-# System & Cache Resource Stats
-# ==============================
-def get_system_stats(doc_text=None, cache_mem_bytes=0):
-    ram = psutil.virtual_memory()
-    cpu = psutil.cpu_percent()
-    disk = psutil.disk_usage('/')
-    used, total = ram.used, ram.total
-    stats = {
-        "Input Tokens": st.session_state.get('input_tokens_count', 0),
-        "Output Tokens": st.session_state.get('output_tokens_count', 0),
-        "Generated Tokens": st.session_state.get('generated_tokens_count', 0),
-        "Document Size (chars)": len(doc_text) if doc_text else 0,
-        "Document Size (KB)": f"{len(doc_text.encode('utf-8')) / 1024:.2f}" if doc_text else 0,
-    }
-    if torch.cuda.is_available():
-        gpu_mem_alloc = torch.cuda.memory_allocated()
-        gpu_mem_total = torch.cuda.get_device_properties(0).total_memory
-        stats["GPU Used"] = sizeof_fmt(gpu_mem_alloc)
-        stats["GPU Total"] = sizeof_fmt(gpu_mem_total)
-        stats["GPU Usage (%)"] = round(100 * gpu_mem_alloc / gpu_mem_total, 2) if gpu_mem_total else 0
-    else:
-        stats["GPU Used"] = "N/A"
-        stats["GPU Total"] = "N/A"
-        stats["GPU Usage (%)"] = "N/A"
-
-    stats["KV Cache Memory Used"] = sizeof_fmt(cache_mem_bytes)
-    stats["KV Cache as % RAM"] = f"{(cache_mem_bytes / total) * 100:.2f}%" if total > 0 else "N/A"
-    stats["KV Cache as % GPU"] = (
-        f"{(cache_mem_bytes / torch.cuda.get_device_properties(0).total_memory) * 100:.2f}%"
-        if torch.cuda.is_available() else "N/A"
-    )
-    return stats
-
-def cache_stats_table(cache):
-    if cache is None:
-        return pd.DataFrame(), 0
-    rows = []
-    total_mem = 0
-    for i, (key, value) in enumerate(zip(cache.key_cache, cache.value_cache)):
-        key_mem = key.element_size() * key.nelement()
-        value_mem = value.element_size() * value.nelement()
-        total_mem += key_mem + value_mem
-        row = {
-            "Layer": i,
-            "Key Shape": str(tuple(key.shape)),
-            "Value Shape": str(tuple(value.shape)),
-            "Total Mem": sizeof_fmt(key_mem + value_mem),
-            "Last Key Tokens": str(tuple(key[..., -1:, :].shape)),
-            "Last Value Tokens": str(tuple(value[..., -1:, :].shape)),
-        }
-        rows.append(row)
-    return pd.DataFrame(rows), total_mem
-
-def resource_dashboard(cache, doc_text, generation_time=None, cache_clone_time=None):
-    cache_df, cache_mem_bytes = cache_stats_table(cache)
-    stats = get_system_stats(doc_text, cache_mem_bytes)
-    st.sidebar.header("🚦 Live Resource & Cache Dashboard")
-    st.sidebar.caption("See how your document and answers use your computer's memory and processing power. The KV Cache lets you answer questions super-fast!")
-    # Use st.table for small stats tables for better rendering
-    stats_table = pd.DataFrame(list(stats.items()), columns=["Metric", "Value"])
-    st.sidebar.table(stats_table)
-    if torch.cuda.is_available() and stats["GPU Usage (%)"] != "N/A":
-        gpu_pct = float(stats["GPU Usage (%)"])
-        st.sidebar.progress(int(min(gpu_pct, 100)), text=f"GPU Usage: {gpu_pct:.1f}%")
-    cache_pct_str = stats["KV Cache as % RAM"]
-    if isinstance(cache_pct_str, str) and cache_pct_str.endswith('%'):
-        try:
-            cache_pct = float(cache_pct_str[:-1])
-        except ValueError:
-            cache_pct = 0
-    else:
-        cache_pct = 0
-    st.sidebar.progress(int(min(cache_pct, 100)), text=f"KV Cache as RAM: {cache_pct:.1f}%")
-    if generation_time is not None or cache_clone_time is not None:
-        time_rows = []
-        if generation_time is not None:
-            time_rows.append({"Step": "Answer Generation", "Time (s)": f"{generation_time:.2f}"})
-        if cache_clone_time is not None:
-            time_rows.append({"Step": "Cache Copy", "Time (s)": f"{cache_clone_time:.2f}"})
-        st.sidebar.table(pd.DataFrame(time_rows))
-    with st.sidebar.expander("🧠 KV Cache Details (per Layer)", expanded=True):
-        st.markdown(
-            "The table below shows the shape, dtype, size, and memory used for each layer's cache in the neural network. Efficient caching speeds up new questions."
-        )
-        if not cache_df.empty:
-            st.dataframe(cache_df, use_container_width=True, height=340)
-        else:
-            st.info("No cache yet. Upload a document to see caching details.")
 
 # Initialize session state variables
 if 'generated_tokens_count' not in st.session_state:
@@ -241,15 +150,33 @@ if uploaded_file:
     with open(temp_file_path, "wb") as f:
         f.write(uploaded_file.getvalue())
     cache, origin_len, doc_text, doc_text_count, model, tokenizer = load_document_and_cache(temp_file_path)
+
+    # Document Info Display
     with st.expander("📄 Document Preview"):
         st.text(doc_text[:500] + "..." if len(doc_text) > 500 else doc_text)
+
+    # Collect System Stats AFTER doc upload
+    cache_df, cache_mem_bytes = cache_stats_table(cache)
+    stats = get_system_stats(doc_text, cache_mem_bytes)
+
+    # Track Time
+    t1 = time()
+
+    # Generate Info Line (Initial)
+    st.info(
+        f"Document Chars: {len(doc_text)} | Size: {stats['Document Size (KB)']} KB | "
+        f"GPU Used: {stats['GPU Used']} | GPU Usage: {stats['GPU Usage (%)']}% | "
+        f"KV Cache Memory: {stats['KV Cache Memory Used']} | "
+        f"Cache as % RAM: {stats['KV Cache as % RAM']} | Cache as % GPU: {stats['KV Cache as % GPU']}"
+    )
+
     query = st.text_input("🔎 Ask a question about the document:")
     if query and st.button("Generate Answer"):
         with st.spinner("Generating answer... (watch the sidebar for memory usage)"):
-            st.sidebar.write(f"Document character count: {len(doc_text)}")
             current_cache = clone_cache(cache)
             t_clone_end = time()
             Cache_create_time = t_clone_end - t1
+
             full_prompt = f"""
 <|user|>
 Question: {query}
@@ -257,8 +184,8 @@ if uploaded_file:
 """.strip()
             input_ids = tokenizer(full_prompt, return_tensors="pt").input_ids
             max_new_tokens = max(32, int(input_ids.shape[-1] * 0.3))
-            print(f"Max new tokens: {max_new_tokens}")
             st.session_state.input_tokens_count += input_ids.shape[-1]
+
             t_gen_start = time()
             output_ids = generate(model, input_ids, current_cache, max_new_tokens=max_new_tokens)
             generated_tokens_count = output_ids.shape[-1]
@@ -267,17 +194,19 @@ if uploaded_file:
             response = tokenizer.decode(output_ids[0], skip_special_tokens=True)
             t_gen_end = time()
             last_generation_time = t_gen_end - t_gen_start
+
             st.success("Answer:")
             st.write(response)
-            st.info(f"Cache create Time: {Cache_create_time:.2f} s | Generation Time: {last_generation_time:.2f} s")
-            if st.button("💾 Save Cache"):
-                torch.save(clean_up(clone_cache(cache), origin_len), "saved_cache.pth")
-                st.success("Cache saved successfully!")
-            # Add Reset button at the end
-            resource_dashboard(cache, doc_text, last_generation_time, last_cache_clone_time)
-else:
-    st.info("Please upload a document to start.")
-    resource_dashboard(None, None)
+
+            # Unified Info Line AFTER Generation
+            st.info(
+                f"Document Chars: {len(doc_text)} | Size: {stats['Document Size (KB)']} KB | "
+                f"GPU Used: {stats['GPU Used']} | GPU Usage: {stats['GPU Usage (%)']}% | "
+                f"KV Cache Memory: {stats['KV Cache Memory Used']} | "
+                f"Cache as % RAM: {stats['KV Cache as % RAM']} | Cache as % GPU: {stats['KV Cache as % GPU']} | "
+                f"Cache Create Time: {Cache_create_time:.2f} s | Generation Time: {last_generation_time:.2f} s"
+            )
+
 
 # Sidebar: Load a previously saved cache
 st.sidebar.header("🛠️ Advanced Options")
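Note on the one functional change outside the UI: model_max_length goes from 2x to 1.3x of round(doc_text_count * 0.3 + 1), i.e. roughly 0.3 tokens budgeted per document character with about 30% headroom instead of 100%. Below is a quick, illustrative check of that arithmetic in plain Python; token_budget is a made-up helper for this note, not something in app.py.

def token_budget(doc_text_count, headroom=1.3):
    # Mirrors the expression now used in load_model_and_tokenizer:
    #   model_max_length = 1.3 * round(doc_text_count * 0.3 + 1)
    # i.e. ~0.3 tokens per character plus proportional headroom.
    return headroom * round(doc_text_count * 0.3 + 1)

for chars in (1_000, 10_000, 50_000):
    old_budget = 2.0 * round(chars * 0.3 + 1)   # budget before this commit
    new_budget = token_budget(chars)            # budget after this commit
    print(f"{chars} chars -> old {old_budget:.0f} tokens, new {new_budget:.0f} tokens")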
 
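The new inline st.info summary takes its cache figures from cache_stats_table and get_system_stats; the per-layer accounting in cache_stats_table amounts to summing element_size() * nelement() over the DynamicCache's key_cache and value_cache tensors. A minimal standalone sketch of that idea follows (kv_cache_bytes is an illustrative name, not a function in app.py).

def kv_cache_bytes(cache):
    # Sum the memory held by every layer's key and value tensors.
    # transformers' DynamicCache exposes them as the key_cache / value_cache
    # lists of tensors, which is exactly what cache_stats_table iterates over.
    total = 0
    for key, value in zip(cache.key_cache, cache.value_cache):
        total += key.element_size() * key.nelement()
        total += value.element_size() * value.nelement()
    return total

# Feeding the result through sizeof_fmt gives the human-readable figure
# reported as "KV Cache Memory" in the info line.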
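For context, the clone-then-generate flow kept by this commit (clone_cache(cache) followed by generate(model, input_ids, current_cache, ...)) is the usual prefix-caching pattern: prefill the document once, copy the resulting KV cache for each question, and let decoding continue from the copy. Below is a rough sketch of that pattern using the stock transformers API rather than app.py's own helpers; the model name is a placeholder, and the exact past_key_values handling in generate() depends on the transformers version.

import copy
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "Qwen/Qwen2.5-0.5B-Instruct"  # placeholder; app.py's model_name is not shown in this diff
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_name)

doc_ids = tokenizer("...long document text...", return_tensors="pt").input_ids

# Prefill once: run the document through the model and keep its KV cache.
with torch.no_grad():
    doc_cache = model(doc_ids, use_cache=True).past_key_values

# Per question: deep-copy the prefilled cache so the original stays reusable,
# then let generation continue from the cached document prefix.
prompt_ids = tokenizer("<|user|>\nQuestion: ...", return_tensors="pt").input_ids
full_ids = torch.cat([doc_ids, prompt_ids], dim=-1)
output_ids = model.generate(
    full_ids,
    past_key_values=copy.deepcopy(doc_cache),
    max_new_tokens=64,
)
print(tokenizer.decode(output_ids[0, full_ids.shape[-1]:], skip_special_tokens=True))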