Spaces:

rodrigomasini
/

rephrase

Paused

App Files Files Community

rodrigomasini committed on Nov 8, 2023

Commit

e8ae0c0

1 Parent(s): f525e0b

Update app.py

Browse files

Files changed (1) hide show

app.py +100 -28

app.py CHANGED Viewed

@@ -1,32 +1,104 @@
 import streamlit as st
-from transformers import AutoTokenizer, pipeline, logging
-from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
-from huggingface_hub import snapshot_download
-#import shutil
-import os
-cwd = os.getcwd()
-cachedir = cwd+'/cache'
-# Check if the directory exists before creating it
-if not os.path.exists(cachedir):
-    os.mkdir(cachedir)
-os.environ['HF_HOME'] = cachedir
-local_folder = cachedir + "/model"
-quantized_model_dir = "FPHam/Jackson_The_Formalizer_V2_13b_GPTQ"
-snapshot_download(repo_id=quantized_model_dir, local_dir=local_folder, local_dir_use_symlinks=True)
-model_basename = cachedir + "/model/Jackson2-4bit-128g-GPTQ"
-use_strict = False
-use_triton = False
 tokenizer = AutoTokenizer.from_pretrained(local_folder, use_fast=False)

 import streamlit as st
+from transformers import AutoTokenizer
+from auto_gptq import AutoGPTQForCausalLM
+import torch
+import subprocess
+import traceback
# Query nvidia-smi for per-GPU memory, degrading to "N/A" when it is unusable
def get_gpu_memory():
    """Report free and total memory for every GPU listed by ``nvidia-smi``.

    The query is best-effort diagnostics: any failure to run or parse
    ``nvidia-smi`` yields placeholder values instead of an exception, so the
    caller can always display the result.

    Returns:
        A list with one ``{"free": ..., "total": ...}`` dict per GPU, in the
        order ``nvidia-smi`` prints them. Each value is an ``int`` as printed
        by ``nvidia-smi`` (``nounits`` format), or the string ``"N/A"`` when
        that field is not numeric. When ``nvidia-smi`` is missing, cannot be
        executed, exits with an error, times out, or prints nothing usable,
        the result is ``[{"free": "N/A", "total": "N/A"}]``.
    """
    unavailable = [{"free": "N/A", "total": "N/A"}]
    query = [
        "nvidia-smi",
        "--query-gpu=memory.free,memory.total",
        "--format=csv,nounits,noheader",
    ]
    try:
        # The timeout keeps a hung driver from blocking the whole app.
        result = subprocess.check_output(query, text=True, timeout=10)
    except (OSError, subprocess.SubprocessError):
        # OSError: binary missing or not executable.
        # SubprocessError: non-zero exit status or timeout.
        return unavailable

    def parse_field(raw):
        # nvidia-smi can print non-numeric markers for unsupported fields.
        try:
            return int(raw.strip())
        except ValueError:
            return "N/A"

    memory_info = []
    for line in result.strip().splitlines():
        fields = line.split(",")
        if len(fields) < 2:
            # Not a "free, total" row (e.g. a diagnostic message); skip it.
            continue
        memory_info.append(
            {"free": parse_field(fields[0]), "total": parse_field(fields[1])}
        )
    return memory_info or unavailable
# Model identifier handed to both the tokenizer and the model loader.
pretrained_model_dir = "FPHam/Jackson_The_Formalizer_V2_13b_GPTQ"

# Show a GPU memory reading taken before anything model-related is loaded.
gpu_memory_before = get_gpu_memory()
st.write(f"GPU Memory Info before loading the model: {gpu_memory_before}")

# Use the first CUDA device when PyTorch reports one, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
    # Release PyTorch's cached GPU memory before the model is loaded.
    torch.cuda.empty_cache()
else:
    device = "cpu"

# Load the slow (non-"fast") tokenizer for the model.
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_dir, use_fast=False)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token
# Model loader; wrapped in st.cache_resource so Streamlit can reuse the result
@st.cache_resource
def load_gptq_model():
    """Load the GPTQ-quantized causal LM and put it in evaluation mode.

    Reads the module-level ``pretrained_model_dir`` and ``device``. Errors
    raised while loading are not handled here; they propagate to the caller.

    Returns:
        The model returned by ``AutoGPTQForCausalLM.from_quantized`` after
        ``eval()`` has been called on it.
    """
    load_options = {
        "model_basename": "Jackson2-4bit-128g-GPTQ",
        "use_safetensors": True,
        "device": device,
        "disable_exllamav2": True,
    }
    quantized_model = AutoGPTQForCausalLM.from_quantized(
        pretrained_model_dir, **load_options
    )
    # Inference only: switch off training-time behaviour.
    quantized_model.eval()
    return quantized_model
# Load the model; model_loaded records whether it is available to the UI below.
model_loaded = False
try:
    model = load_gptq_model()
except RuntimeError as e:
    # Only an out-of-memory failure gets a friendly message. Anything else
    # propagates unchanged, keeping its original traceback (bare raise).
    if 'CUDA out of memory' not in str(e):
        raise
    st.error("CUDA out of memory while loading the model. Try reducing the model size or restarting the app.")
    # Halt this script run; nothing below can work without a model.
    st.stop()
else:
    model_loaded = True
# Prompt form and text generation; only rendered once the model is available.
if model_loaded:
    # Display GPU memory information after loading the model
    gpu_memory_after = get_gpu_memory()
    st.write(f"GPU Memory Info after loading the model: {gpu_memory_after}")

    # Two side-by-side inputs: the phrase and the generation budget.
    col1, col2 = st.columns(2)
    with col1:
        user_input = st.text_input("Input a phrase")
    with col2:
        max_token = st.number_input(label="Select max number of generated tokens", min_value=1, max_value=512, value=50, step=5)

    # Generate button
    if st.button("Generate the prompt"):
        try:
            prompt_template = f'USER: {user_input}\nASSISTANT:'
            # A single prompt needs no padding. Padding it to max_length would
            # append pad (= EOS) tokens after "ASSISTANT:" and make the model
            # continue from them. Truncation still caps the prompt at 512 tokens.
            inputs = tokenizer(prompt_template, return_tensors='pt', max_length=512, truncation=True)
            inputs = inputs.to(device)  # Move inputs to the same device as model

            # inference_mode disables autograd tracking during generation.
            with torch.inference_mode():
                output = model.generate(**inputs, max_new_tokens=max_token)

            # Drop the echoed prompt tokens so only the newly generated text is shown.
            prompt_length = inputs["input_ids"].shape[1]
            generated_text = tokenizer.decode(output[0][prompt_length:], skip_special_tokens=True)
            st.markdown(f"**Generated Text:**\n{generated_text}")
        except RuntimeError as e:
            # Every RuntimeError is appended to the log file, whatever its cause.
            with open('error_log.txt', 'a') as f:
                f.write(traceback.format_exc())
            if 'CUDA out of memory' not in str(e):
                # Not an OOM: re-raise with the original traceback intact.
                raise
            st.error("CUDA out of memory during generation. Try reducing the input length or restarting the app.")

        # Display GPU memory information after generation
        gpu_memory_after_generation = get_gpu_memory()
        st.write(f"GPU Memory Info after generation: {gpu_memory_after_generation}")
 tokenizer = AutoTokenizer.from_pretrained(local_folder, use_fast=False)