rodrigomasini committed on
Commit e8ae0c0
1 Parent(s): f525e0b

Update app.py

Files changed (1)
  1. app.py +100 -28
app.py CHANGED
@@ -1,32 +1,104 @@
  import streamlit as st
- from transformers import AutoTokenizer, pipeline, logging
- from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
- from huggingface_hub import snapshot_download
-
- #import shutil
- import os
-
- cwd = os.getcwd()
- cachedir = cwd+'/cache'
-
- # Check if the directory exists before creating it
- if not os.path.exists(cachedir):
-     os.mkdir(cachedir)
-
- os.environ['HF_HOME'] = cachedir
-
- local_folder = cachedir + "/model"
-
-
- quantized_model_dir = "FPHam/Jackson_The_Formalizer_V2_13b_GPTQ"
-
- snapshot_download(repo_id=quantized_model_dir, local_dir=local_folder, local_dir_use_symlinks=True)
-
- model_basename = cachedir + "/model/Jackson2-4bit-128g-GPTQ"
-
- use_strict = False
-
- use_triton = False
+ from transformers import AutoTokenizer
+ from auto_gptq import AutoGPTQForCausalLM
+ import torch
+ import subprocess
+ import traceback
+
+ # Function to get memory info
+ def get_gpu_memory():
+     try:
+         result = subprocess.check_output(["nvidia-smi", "--query-gpu=memory.free,memory.total", "--format=csv,nounits,noheader"], text=True)
+         memory_info = [x.split(',') for x in result.strip().split('\n')]
+         memory_info = [{"free": int(x[0].strip()), "total": int(x[1].strip())} for x in memory_info]
+     except FileNotFoundError:
+         memory_info = [{"free": "N/A", "total": "N/A"}]
+     return memory_info
+
+ # Display GPU memory information before loading the model
+ gpu_memory_before = get_gpu_memory()
+ st.write(f"GPU Memory Info before loading the model: {gpu_memory_before}")
+
+ # Define pretrained model directory
+ pretrained_model_dir = "FPHam/Jackson_The_Formalizer_V2_13b_GPTQ"
+
+ # Check if CUDA is available and get the device
+ device = "cuda:0" if torch.cuda.is_available() else "cpu"
+
+ # Before allocating or loading the model, clear up memory if CUDA is available
+ if device == "cuda:0":
+     torch.cuda.empty_cache()
+
+ # Load tokenizer
+ tokenizer = AutoTokenizer.from_pretrained(pretrained_model_dir, use_fast=False)
+ tokenizer.pad_token = tokenizer.eos_token  # Ensure padding token is set correctly for the model
+
+ # Attempt to load the model, catch any OOM errors
+ @st.cache_resource
+ def load_gptq_model():
+     model = AutoGPTQForCausalLM.from_quantized(
+         pretrained_model_dir,
+         model_basename="Jackson2-4bit-128g-GPTQ",
+         use_safetensors=True,
+         device=device,
+         disable_exllamav2=True
+     )
+     model.eval()  # Set the model to inference mode
+     return model
+
+ model_loaded = False
+ # Attempt to load the model, catch any OOM errors
+ try:
+     model = load_gptq_model()
+     model_loaded = True
+ except RuntimeError as e:
+     if 'CUDA out of memory' in str(e):
+         st.error("CUDA out of memory while loading the model. Try reducing the model size or restarting the app.")
+         st.stop()
+     else:
+         raise e
+
+ if model_loaded:
+     # Display GPU memory information after loading the model
+     gpu_memory_after = get_gpu_memory()
+     st.write(f"GPU Memory Info after loading the model: {gpu_memory_after}")
+
+     col1, col2 = st.columns(2)
+     with col1:
+         user_input = st.text_input("Input a phrase")
+     with col2:
+         max_token = st.number_input(label="Select max number of generated tokens", min_value=1, max_value=512, value=50, step=5)
+
+     # Generate button
+     if st.button("Generate the prompt"):
+         try:
+             prompt_template = f'USER: {user_input}\nASSISTANT:'
+             inputs = tokenizer(prompt_template, return_tensors='pt', max_length=512, truncation=True, padding='max_length')
+             inputs = inputs.to(device)  # Move inputs to the same device as the model
+             # Generate text using torch.inference_mode for better performance during inference
+             with torch.inference_mode():
+                 output = model.generate(**inputs, max_new_tokens=max_token)
+
+             # Cut the tokens at the input length to display only the generated text
+             output_ids_cut = output[:, inputs["input_ids"].shape[1]:]
+             generated_text = tokenizer.decode(output_ids_cut[0], skip_special_tokens=True)
+
+             st.markdown(f"**Generated Text:**\n{generated_text}")
+         except RuntimeError as e:
+             if 'CUDA out of memory' in str(e):
+                 st.error("CUDA out of memory during generation. Try reducing the input length or restarting the app.")
+                 # Log the detailed error message
+                 with open('error_log.txt', 'a') as f:
+                     f.write(traceback.format_exc())
+             else:
+                 # Log the error and re-raise it
+                 with open('error_log.txt', 'a') as f:
+                     f.write(traceback.format_exc())
+                 raise e
+
+     # Display GPU memory information after generation
+     gpu_memory_after_generation = get_gpu_memory()
+     st.write(f"GPU Memory Info after generation: {gpu_memory_after_generation}")
 
  tokenizer = AutoTokenizer.from_pretrained(local_folder, use_fast=False)
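
For reference, below is a minimal standalone sketch of the load-and-generate path this commit moves app.py to, with the Streamlit UI stripped away so it can be run from a plain Python shell. It reuses the repo id FPHam/Jackson_The_Formalizer_V2_13b_GPTQ, the Jackson2-4bit-128g-GPTQ basename, and the USER/ASSISTANT prompt format from the diff; the example prompt text, the 50-token limit, and the assumption of a single CUDA device with an auto-gptq version that accepts disable_exllamav2 are illustrative, not part of the commit.

# Standalone sketch (illustrative, not part of the commit): load the quantized
# checkpoint and run one generation without Streamlit.
import torch
from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM

pretrained_model_dir = "FPHam/Jackson_The_Formalizer_V2_13b_GPTQ"
device = "cuda:0" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained(pretrained_model_dir, use_fast=False)
tokenizer.pad_token = tokenizer.eos_token

model = AutoGPTQForCausalLM.from_quantized(
    pretrained_model_dir,
    model_basename="Jackson2-4bit-128g-GPTQ",
    use_safetensors=True,
    device=device,
    disable_exllamav2=True,
)
model.eval()

# Hypothetical example input; the app reads this from a Streamlit text box instead.
prompt = "USER: Rewrite 'gonna grab some food' in a formal register.\nASSISTANT:"
inputs = tokenizer(prompt, return_tensors="pt").to(device)
with torch.inference_mode():
    output = model.generate(**inputs, max_new_tokens=50)

# Slice off the prompt tokens so only the newly generated text is printed,
# mirroring the output_ids_cut logic in app.py.
print(tokenizer.decode(output[0, inputs["input_ids"].shape[1]:], skip_special_tokens=True))

As in the app, slicing the output at the input length keeps the echoed prompt out of the displayed text; the app itself is started with streamlit run app.py.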