Spaces: Running on Zero
Provide the previous prompt as "History"
app.py CHANGED
@@ -26,7 +26,6 @@ If you duplicate this space, make sure you have access to [meta-llama/Llama-2-7b
 because this model uses it as a tokenizer.
 
 # Note: Use this model for only for completing sentences and instruction following.
-## While the user interface is a chatbot for convenience, this is an instruction tuned model not fine-tuned for chatbot tasks. As such, the model is not provided a chat history and will complete your text based on the last given prompt only.
 """
 
 LICENSE = """
@@ -35,8 +34,6 @@ LICENSE = """
 ---
 As a derivative work of [OpenELM-3B-Instruct](https://huggingface.co/apple/OpenELM-3B-Instruct) by Apple,
 this demo is governed by the original [license](https://huggingface.co/apple/OpenELM-3B-Instruct/blob/main/LICENSE).
-
-This demo Space was created by [Doron Adler](https://linktr.ee/Norod78)
 """
 
 if not torch.cuda.is_available():
@@ -51,6 +48,7 @@ if torch.cuda.is_available():
     if tokenizer.pad_token == None:
         tokenizer.pad_token = tokenizer.eos_token
         tokenizer.pad_token_id = tokenizer.eos_token_id
+        model.config.pad_token_id = tokenizer.eos_token_id
 
 @spaces.GPU
 def generate(
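The added line keeps the model's generation config in sync with the tokenizer's pad-token fallback. A minimal sketch of the pattern, assuming the checkpoints named in this Space's description (OpenELM-3B-Instruct served with a Llama-2 tokenizer, which ships without a pad token); the loading arguments are illustrative:

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Checkpoint names follow the Space's description; adjust to what you load.
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
model = AutoModelForCausalLM.from_pretrained(
    "apple/OpenELM-3B-Instruct",
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,  # OpenELM ships custom modeling code
)

if tokenizer.pad_token is None:
    # Llama-2 tokenizers define no pad token; reuse EOS so padding works
    # without resizing the embedding matrix.
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id
    # The line this commit adds: mirror the id into the model config so
    # generate() picks it up even when pad_token_id isn't passed explicitly.
    model.config.pad_token_id = tokenizer.eos_token_id
```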
@@ -63,6 +61,13 @@ def generate(
     repetition_penalty: float = 1.4,
 ) -> Iterator[str]:
 
+    historical_text = ""
+    #Prepend the entire chat history to the message with new lines between each message
+    for user, assistant in chat_history:
+        historical_text += f"\n{user}\n{assistant}"
+
+    if len(historical_text) > 0:
+        message = historical_text + f"\n{message}"
     input_ids = tokenizer([message], return_tensors="pt").input_ids
     if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
         input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
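This added block is the commit's core change: the Gradio-style chat history, a list of (user, assistant) pairs, is flattened into plain text and prepended to the new prompt, and the pre-existing slice below it left-truncates to the newest MAX_INPUT_TOKEN_LENGTH tokens, so long chats shed their oldest turns first. A standalone sketch of the flattening; `build_prompt` is a hypothetical helper (the Space inlines this logic) and the example strings are illustrative:

```python
# Hypothetical helper isolating the history flattening added above.
def build_prompt(message: str, chat_history: list[tuple[str, str]]) -> str:
    historical_text = ""
    # Prepend every past (user, assistant) turn, separated by newlines.
    for user, assistant in chat_history:
        historical_text += f"\n{user}\n{assistant}"
    if len(historical_text) > 0:
        message = historical_text + f"\n{message}"
    return message

history = [("Name three primary colors.", "Red, yellow, and blue.")]
print(build_prompt("Now name three secondary colors.", history))
# -> "\nName three primary colors.\nRed, yellow, and blue.\nNow name three secondary colors."
```

Note that the turns carry no role markers, so the model sees one continuous text rather than a structured chat transcript, which matches how this instruction-tuned (not chat-tuned) model is meant to be prompted.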
@@ -82,7 +87,7 @@ def generate(
         pad_token_id = tokenizer.eos_token_id,
         repetition_penalty=repetition_penalty,
         no_repeat_ngram_size=5,
-        early_stopping=
+        early_stopping=False,
     )
     t = Thread(target=model.generate, kwargs=generate_kwargs)
     t.start()
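The `t = Thread(target=model.generate, ...)` tail is one half of transformers' threaded streaming pattern; the streamer and the rest of `generate_kwargs` sit outside this hunk, so the sketch below is an assumed reconstruction of that pattern rather than the Space's exact code, with illustrative parameter values:

```python
from threading import Thread
from typing import Iterator

from transformers import TextIteratorStreamer

def stream_generate(
    message: str,
    max_new_tokens: int = 256,
    temperature: float = 0.7,
    repetition_penalty: float = 1.4,
) -> Iterator[str]:
    # Assumes `model` and `tokenizer` from the earlier sketch are in scope.
    input_ids = tokenizer([message], return_tensors="pt").input_ids
    input_ids = input_ids.to(model.device)
    streamer = TextIteratorStreamer(
        tokenizer, timeout=30.0, skip_prompt=True, skip_special_tokens=True
    )
    generate_kwargs = dict(
        input_ids=input_ids,
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=temperature,
        pad_token_id=tokenizer.eos_token_id,
        repetition_penalty=repetition_penalty,
        no_repeat_ngram_size=5,
        early_stopping=False,  # the value this commit settles on
    )
    # generate() runs on a worker thread; iterating the streamer here
    # yields decoded text chunks as soon as they are produced.
    t = Thread(target=model.generate, kwargs=generate_kwargs)
    t.start()
    outputs = []
    for chunk in streamer:
        outputs.append(chunk)
        yield "".join(outputs)
```

In transformers, `early_stopping` controls beam-search stopping only, so under the sampling path sketched here the `False` value should be inert.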