kodetr committed (verified)
Commit 1327db3 · Parent: 9e26a79

Update app.py

Files changed (1)
  1. app.py +64 -63
app.py CHANGED
@@ -38,22 +38,23 @@ h3 {
 # ------- use model stunting V5 -------
 # -------------------------------------

-text_pipeline = pipeline(
-    "text-generation",
-    model=MODEL_ID,
-    model_kwargs={"torch_dtype": torch.bfloat16},
-    device_map="auto",
-)
+# text_pipeline = pipeline(
+#     "text-generation",
+#     model=MODEL_ID,
+#     model_kwargs={"torch_dtype": torch.bfloat16},
+#     device_map="auto",
+# )
+
 # -------------------------------------
 # ------- use model stunting V6 -------
 # -------------------------------------

-# model = AutoModelForCausalLM.from_pretrained(
-#     MODEL_ID,
-#     torch_dtype=torch.bfloat16,
-#     device_map="auto",
-# )
-# tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+model = AutoModelForCausalLM.from_pretrained(
+    MODEL_ID,
+    torch_dtype=torch.bfloat16,
+    device_map="auto",
+)
+tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

 @spaces.GPU
 def stream_chat(message: str, history: list, temperature: float, max_new_tokens: int, top_p: float, top_k: int, penalty: float):
@@ -71,72 +72,72 @@ def stream_chat(message: str, history: list, temperature: float, max_new_tokens:
     # -------------------------------------

     # Convert the conversation to a prompt-style string
-    conversation_text = ""
-    for turn in conversation:
-        role = turn["role"]
-        content = turn["content"]
-        if role == "system":
-            conversation_text += f"[SYSTEM]: {content}\n"
-        elif role == "user":
-            conversation_text += f"[USER]: {content}\n"
-        elif role == "assistant":
-            conversation_text += f"[ASSISTANT]: {content}\n"
+    # conversation_text = ""
+    # for turn in conversation:
+    #     role = turn["role"]
+    #     content = turn["content"]
+    #     if role == "system":
+    #         conversation_text += f"[SYSTEM]: {content}\n"
+    #     elif role == "user":
+    #         conversation_text += f"[USER]: {content}\n"
+    #     elif role == "assistant":
+    #         conversation_text += f"[ASSISTANT]: {content}\n"

-    terminators = [
-        text_pipeline.tokenizer.eos_token_id,
-        text_pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>")
-    ]
+    # terminators = [
+    #     text_pipeline.tokenizer.eos_token_id,
+    #     text_pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>")
+    # ]

     # The pipeline output is a list of dictionaries containing the text
-    outputs = text_pipeline(
-        conversation_text,
-        max_new_tokens=max_new_tokens,
-        eos_token_id=terminators,
-        do_sample=True,
-        temperature=temperature,
-        top_p=top_p,
-        top_k=top_k,
-        repetition_penalty=penalty
-    )
+    # outputs = text_pipeline(
+    #     conversation_text,
+    #     max_new_tokens=max_new_tokens,
+    #     eos_token_id=terminators,
+    #     do_sample=True,
+    #     temperature=temperature,
+    #     top_p=top_p,
+    #     top_k=top_k,
+    #     repetition_penalty=penalty
+    # )

     # 4. Extract the generated text and stream it sentence by sentence
-    generated_text = outputs[0].get("generated_text", "")
-    streamed_text = generated_text[len(conversation_text):].strip()  # strip the initial prompt
+    # generated_text = outputs[0].get("generated_text", "")
+    # streamed_text = generated_text[len(conversation_text):].strip()  # strip the initial prompt

-    buffer = ""
-    for part in streamed_text.split(". "):
-        buffer += part.strip() + ". "
-        yield buffer
+    # buffer = ""
+    # for part in streamed_text.split(". "):
+    #     buffer += part.strip() + ". "
+    #     yield buffer

     # -------------------------------------
     # ------- use model stunting V6 -------
     # -------------------------------------

-    # input_ids = tokenizer.apply_chat_template(conversation, tokenize=False, add_generation_prompt=True)
-    # inputs = tokenizer(input_ids, return_tensors="pt").to(0)  # CUDA device 0; use "cpu" to run on CPU
+    input_ids = tokenizer.apply_chat_template(conversation, tokenize=False, add_generation_prompt=True)
+    inputs = tokenizer(input_ids, return_tensors="pt").to(0)  # CUDA device 0; use "cpu" to run on CPU

-    # streamer = TextIteratorStreamer(tokenizer, timeout=60., skip_prompt=True, skip_special_tokens=True)
+    streamer = TextIteratorStreamer(tokenizer, timeout=60., skip_prompt=True, skip_special_tokens=True)

-    # generate_kwargs = dict(
-    #     inputs,
-    #     streamer=streamer,
-    #     top_k=top_k,
-    #     top_p=top_p,
-    #     repetition_penalty=penalty,
-    #     max_new_tokens=max_new_tokens,
-    #     do_sample=True,
-    #     temperature=temperature,
-    #     pad_token_id=128000,
-    #     eos_token_id=[128001, 128008, 128009],
-    # )
+    generate_kwargs = dict(
+        inputs,
+        streamer=streamer,
+        top_k=top_k,
+        top_p=top_p,
+        repetition_penalty=penalty,
+        max_new_tokens=max_new_tokens,
+        do_sample=True,
+        temperature=temperature,
+        pad_token_id=128000,
+        eos_token_id=[128001, 128008, 128009],
+    )

-    # thread = Thread(target=model.generate, kwargs=generate_kwargs)
-    # thread.start()
+    thread = Thread(target=model.generate, kwargs=generate_kwargs)
+    thread.start()

-    # buffer = ""
-    # for new_text in streamer:
-    #     buffer += new_text
-    #     yield buffer
+    buffer = ""
+    for new_text in streamer:
+        buffer += new_text
+        yield buffer

 
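For context, the V5 path that this commit comments out never streamed real tokens: it ran the whole generation through a transformers text-generation pipeline, sliced the echoed prompt off the result, and then replayed the finished text one sentence at a time. A minimal, self-contained sketch of that pattern follows; the model id and sampling values are placeholders, not the Space's actual configuration.

# Sketch of the V5 "pseudo-streaming" path (placeholder model and settings).
from typing import Iterator

import torch
from transformers import pipeline

MODEL_ID = "HuggingFaceH4/zephyr-7b-beta"  # placeholder, not the Space's model

text_pipeline = pipeline(
    "text-generation",
    model=MODEL_ID,
    model_kwargs={"torch_dtype": torch.bfloat16},
    device_map="auto",
)

def pseudo_stream(prompt: str, max_new_tokens: int = 256) -> Iterator[str]:
    # Generate the full completion up front; nothing is streamed yet.
    outputs = text_pipeline(
        prompt,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
    )
    # The pipeline echoes the prompt, so slice it off the front.
    generated = outputs[0]["generated_text"][len(prompt):].strip()

    # No real token stream: the text already exists in full, and this loop
    # only replays it one sentence at a time.
    buffer = ""
    for sentence in generated.split(". "):
        buffer += sentence.strip() + ". "
        yield buffer

The drawback is latency: nothing reaches the UI until generate() has produced every token, which is presumably why the commit switches to a real streamer.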
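The V6 path the commit enables is the standard transformers streaming pattern: model.generate() blocks until the last token, so it runs on a worker thread while a TextIteratorStreamer hands decoded text back to the generator as it arrives. A minimal sketch under the same assumptions as above (placeholder model id, hard-coded sampling values in place of the Gradio-supplied parameters; the commit's Llama-3-style pad/eos token ids are omitted in favor of the tokenizer's defaults):

# Sketch of the V6 threaded-streaming path (placeholder model and settings).
from threading import Thread
from typing import Iterator

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

MODEL_ID = "HuggingFaceH4/zephyr-7b-beta"  # placeholder, not the Space's model

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

def stream_chat(message: str) -> Iterator[str]:
    conversation = [{"role": "user", "content": message}]

    # Two-step prompt construction, mirroring the commit: render the chat
    # template to a plain string, then tokenize that string.
    prompt = tokenizer.apply_chat_template(
        conversation, tokenize=False, add_generation_prompt=True
    )
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # skip_prompt=True keeps the echoed input out of the stream.
    streamer = TextIteratorStreamer(
        tokenizer, timeout=60.0, skip_prompt=True, skip_special_tokens=True
    )

    # BatchEncoding is a mapping, so dict(inputs, ...) merges input_ids and
    # attention_mask with the generation arguments, as the commit does.
    generate_kwargs = dict(
        inputs,
        streamer=streamer,
        max_new_tokens=256,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        top_k=50,
        repetition_penalty=1.1,
    )

    # Run the blocking generate() on a worker thread and drain the streamer
    # from this generator as tokens arrive.
    thread = Thread(target=model.generate, kwargs=generate_kwargs)
    thread.start()

    buffer = ""
    for new_text in streamer:
        buffer += new_text
        yield buffer  # yield the growing reply, as the commit's stream_chat does

Each yield hands back the whole response so far, which is the accumulation style Gradio's ChatInterface expects and matches how the commit's stream_chat builds its buffer.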