shariar076 committed
Commit 30d64ae · verified · 1 Parent(s): 5311a81

Update app.py

Files changed (1)
  1. app.py +58 -1
app.py CHANGED
@@ -1,5 +1,62 @@
  import gradio as gr
- from .bn_llm_wrapper import generate
+ import os
+ import torch
+ from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, GenerationConfig
+
+ model_path = os.environ.get("HF_REPO_ID")
+ access_token = os.environ.get("HF_TOKEN")
+
+
+ tokenizer = AutoTokenizer.from_pretrained(model_path, token=access_token)
+
+ bnb_config = BitsAndBytesConfig(
+     load_in_4bit=True,
+     # load_in_8bit=use_8_bit,
+     bnb_4bit_quant_type="nf4",
+     bnb_4bit_compute_dtype=getattr(torch, "bfloat16"),
+     bnb_4bit_use_double_quant=True,
+ )
+
+ model = AutoModelForCausalLM.from_pretrained(model_path, token=access_token,
+                                              quantization_config=bnb_config,
+                                              torch_dtype=torch.float16,
+                                              # attn_implementation="flash_attention_2",
+                                              device_map='auto')
+
+ if torch.cuda.is_available():
+     device = "cuda"
+ else:
+     device = "cpu"
+
+ def generate(
+     question,
+     context=None,
+     temperature=0.7,
+     top_p=0.7,
+     top_k=40,
+     num_beams=4,
+     max_new_tokens=256,):
+     prompt = f"### CONTEXT:\n{context}\n\n### QUESTION:\n{question}\n\n### ANSWER:"
+     inputs = tokenizer(prompt, return_tensors="pt")
+     input_ids = inputs["input_ids"].to(device)
+     generation_config = GenerationConfig(
+         temperature=temperature,
+         top_p=top_p,
+         top_k=top_k,
+         num_beams=num_beams,
+     )
+     # with torch.autocast("cuda"):
+     with torch.no_grad():
+         generation_output = model.generate(
+             input_ids=input_ids,
+             generation_config=generation_config,
+             return_dict_in_generate=True,
+             output_scores=True,
+             max_new_tokens=max_new_tokens,
+         )
+     seq = generation_output.sequences[0]
+     output = tokenizer.decode(seq)
+     return output
 
  """
  For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
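The hunk ends inside the space template's comment block, so the Gradio UI that consumes the new in-file generate() is not shown in this diff. A minimal sketch of how it might be wired up, assuming a plain gr.Interface with hypothetical Question/Context/Answer fields (none of this is part of the commit):

# Hypothetical wiring, not part of this commit: the changed lines stop before the
# Gradio UI, so this only illustrates one way the new generate() could be exposed.
def answer(question, context):
    # Pass the optional context through; sampling parameters keep their defaults.
    return generate(question, context=context)

demo = gr.Interface(
    fn=answer,
    inputs=[gr.Textbox(label="Question"), gr.Textbox(label="Context", lines=5)],
    outputs=gr.Textbox(label="Answer"),
)

if __name__ == "__main__":
    demo.launch()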