jbilcke-hf committed
Commit 566b8be
1 Parent(s): f62c867

Update app.py

Files changed (1): app.py (+25 -10)

app.py CHANGED
@@ -22,18 +22,31 @@ def readb64(b64):
     return img


-# not sure why
+#
+# this version works in the official demo but not when I fork it, and I'm not sure why
+#
 #import subprocess
 #subprocess.run('pip3 install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
+#model_id = "vikhyatk/moondream2"
+#revision = "2024-04-02"
+#tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision)
+#moondream = AutoModelForCausalLM.from_pretrained(
+#    model_id, trust_remote_code=True, revision=revision,
+#    torch_dtype=torch.bfloat16, device_map={"": "cuda"},
+#    attn_implementation="flash_attention_2"
+#)
+#moondream.eval()

+# so let's use an older version
+if torch.cuda.is_available():
+    device, dtype = "cuda", torch.float16
+else:
+    device, dtype = "cpu", torch.float32
 model_id = "vikhyatk/moondream2"
-revision = "2024-04-02"
-tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision)
+tokenizer = AutoTokenizer.from_pretrained(model_id, revision="2024-03-06")
 moondream = AutoModelForCausalLM.from_pretrained(
-    model_id, trust_remote_code=True, revision=revision,
-    torch_dtype=torch.bfloat16, device_map={"": "cuda"},
-    attn_implementation="flash_attention_2"
-)
+    model_id, trust_remote_code=True, revision="2024-03-06"
+).to(device=device, dtype=dtype)
 moondream.eval()

 def answer_question(secret_token, input, prompt):
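
For reference, here is the new loading path from this hunk as a self-contained snippet (a sketch: the torch and transformers imports are assumed to match what app.py already pulls in at the top of the file):

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# fall back to CPU and float32 when no GPU is available
if torch.cuda.is_available():
    device, dtype = "cuda", torch.float16
else:
    device, dtype = "cpu", torch.float32

model_id = "vikhyatk/moondream2"
# pin the older 2024-03-06 revision instead of 2024-04-02, sidestepping the
# flash-attn setup that the author says broke in the forked Space
tokenizer = AutoTokenizer.from_pretrained(model_id, revision="2024-03-06")
moondream = AutoModelForCausalLM.from_pretrained(
    model_id, trust_remote_code=True, revision="2024-03-06"
).to(device=device, dtype=dtype)
moondream.eval()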
 
@@ -60,11 +73,13 @@ def answer_question(secret_token, input, prompt):

     buffer = ""
     for new_text in streamer:
+
+        # do we really need this?
+        clean_text = re.sub("<$|<END$", "", new_text)
+
         buffer += new_text

-    buffer.strip()
-
-    return buffer
+    return buffer.strip()

 with gr.Blocks() as demo:
     gr.HTML("""
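
Two notes on this hunk: the regex needs an "import re" at the top of app.py, and as committed, clean_text is computed but never used, since the next line still appends the raw new_text. If the intent was to strip a trailing "<" or "<END" end-of-sequence marker from each streamed chunk (for example, re.sub("<$|<END$", "", "Hello<END") returns "Hello"), the loop would presumably read as below; collect_answer is a hypothetical stand-in for this part of answer_question:

import re

def collect_answer(streamer):
    # hypothetical helper: accumulate streamed chunks, dropping a trailing
    # "<" or "<END" marker from each one before appending it
    buffer = ""
    for new_text in streamer:
        clean_text = re.sub("<$|<END$", "", new_text)
        buffer += clean_text
    return buffer.strip()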