It runs, but the output is gibberish.
Environment: Google Colab, T4 GPU.
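For context, a minimal sketch (not part of the original run) of how the GPU and its free memory can be confirmed in Colab:

# Minimal sketch: confirm the Colab GPU and its free memory.
import torch

print(torch.cuda.get_device_name(0))      # e.g. "Tesla T4"
free, total = torch.cuda.mem_get_info(0)  # values are in bytes
print(f"free {free / 1e9:.1f} GB / total {total / 1e9:.1f} GB")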
from transformers import AutoTokenizer, AutoModelForCausalLM
quantized_model = AutoModelForCausalLM.from_pretrained(
    "ISTA-DASLab/Meta-Llama-3-70B-AQLM-PV-1Bit-1x16",
    torch_dtype="auto", device_map="auto", low_cpu_mem_usage=True,
)
tokenizer = AutoTokenizer.from_pretrained("ISTA-DASLab/Meta-Llama-3-70B-AQLM-PV-1Bit-1x16")
%%time
output = quantized_model.generate(tokenizer("The inventor of the electric lamp is", return_tensors="pt")["input_ids"].cuda(), min_new_tokens=128, max_new_tokens=128)
print(tokenizer.decode(output[0]))
import json
import textwrap

system_prompt = "You are a helpful assistant."

def get_prompt(human_prompt):
    # Wrap the user prompt in a simple system/USER/ASSISTANT template.
    prompt_template = f"{system_prompt}\n\nUSER: {human_prompt} \nASSISTANT: "
    return prompt_template

def remove_human_text(text):
    # Cut off anything generated after a follow-up "USER:" turn.
    return text.split('USER:', 1)[0]

def parse_text(data):
    # Extract and pretty-print the assistant's part of each pipeline result.
    for item in data:
        text = item['generated_text']
        assistant_text_index = text.find('ASSISTANT:')
        if assistant_text_index != -1:
            assistant_text = text[assistant_text_index + len('ASSISTANT:'):].strip()
            assistant_text = remove_human_text(assistant_text)
            wrapped_text = textwrap.fill(assistant_text, width=100)
            print("#####", wrapped_text)
            # return assistant_text
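To make the helpers' behaviour concrete, here is a tiny self-contained example with made-up text (not real model output):

sample = [{"generated_text": get_prompt("Who is Napoleon Bonaparte?") + "Napoleon Bonaparte was a French military leader. USER: thanks"}]
parse_text(sample)
# prints something like: ##### Napoleon Bonaparte was a French military leader.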
from transformers import GenerationConfig, pipeline
pipe = pipeline(
    "text-generation",
    model=quantized_model,
    tokenizer=tokenizer,
    max_length=512,
    temperature=0.7,
    top_p=0.95,
    do_sample=True,
)
%%time
prompt = '''Who is Napoleon Bonaparte?
'''
raw_output = pipe(get_prompt(prompt))
parse_text(raw_output)
Setting pad_token_id to eos_token_id:128001 for open-end generation.
Napoleon to, Unidos!'s. service. to. information to,encing; actions's. . from and to\\\\.
by,GEN, Nadu, Unidos,GEN, to\\. phone, Unidos, Unidos,encing,rieved.itations. to..POST,GEN
that\\,itation. to is. Commons,iating. the Aires, sẻ. service..POST's the.POST. to was\\ that
the services,inars\\ \\. account. to to,iable, Attribution,ше’s. Commons. information,encing,
loans with. service : the's,itations. . to will... the\\ need, Unidos. to need,GED.GEN need.
to,GEN to\\ that that the.POST.. . to,antro to to, Unidos. information. by,mination to, Aires.
to. browser, Nadu:. .’s's. service. account. to by,GEN.. by,้ email,iating. enough that was. that.
that\\\\'s. that the services that.. account's to. to, Nadu, Nadu..POST.
available,.EntityFrameworkCore. to the.awt.'s is to. 's,ше... to: Commons's the to's\\. phone the
by. that. understanding. enoughQuestion. email,antro with.. enough. email to\\. that.'s can.
with. service\\ -Cola with.’s: contact.. works, UnidosQuestion. Aires. email. account.
services..,..awt. information: Aires\\. to. needs\\, Nadu's the!, Commons. service. from,
with..awt.. information. means. Commons's, Scholar. account. by the services, Unidos the works:
information the account to. email. works. .: information that.. browser\\. visit. to. phone\\.
that. because. because the service..util. email,antro, Aires. works,ivery by. use work. services
the hosting need to\\. account.\\ the devices\\. information and\\.. to,iates the service,
Unidos..awt. to are: work; to, Unidos.. : .. by. enough. by to's., Unidos the website the's
service. that. the, Unidos to: information,iable,inars. service:\\, Unidos. account\\\\
CPU times: user 2min 23s, sys: 17.1 s, total: 2min 40s
Wall time: 2min 41s
Why is the output complete gibberish?
from transformers import AutoTokenizer, AutoModelForCausalLM
quantized_model = AutoModelForCausalLM.from_pretrained(
    "ISTA-DASLab/Meta-Llama-3-70B-AQLM-PV-1Bit-1x16",
    torch_dtype="auto", device_map="auto", low_cpu_mem_usage=True,
)
tokenizer = AutoTokenizer.from_pretrained("ISTA-DASLab/Meta-Llama-3-70B-AQLM-PV-1Bit-1x16")
output = quantized_model.generate(tokenizer("The inventor of the electric lamp is", return_tensors="pt")["input_ids"].cuda(), min_new_tokens=128, max_new_tokens=128)
print(tokenizer.decode(output[0]))
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's attention_mask to obtain reliable results.
Setting pad_token_id to eos_token_id:128001 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's attention_mask to obtain reliable results.
/usr/local/lib/python3.10/dist-packages/torch/utils/cpp_extension.py:1965: UserWarning: TORCH_CUDA_ARCH_LIST is not set, all archs for visible cards are included for compilation. If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST'].
/usr/local/lib/python3.10/dist-packages/aqlm/inference_kernels/cuda_kernel.py: FutureWarning: torch.library.impl_abstract was renamed to torch.library.register_fake. Please use that instead; we will remove torch.library.impl_abstract in a future version of PyTorch. (Repeated for aqlm::code1x16_matmat, code1x16_matmat_dequant, code1x16_matmat_dequant_transposed, code2x8_matmat, code2x8_matmat_dequant and code2x8_matmat_dequant_transposed.)
The inventor of the electric lamp isEMPLARY металли металли металли металли металлиückückückückückückückückückückückückückückückück
What is the solution to the garbled, incomprehensible words in the response?
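For reference, the warnings themselves ask for an explicit attention_mask and pad_token_id. A minimal sketch of the same generate call with both passed in (this silences the warnings, though it may or may not change the gibberish):

# Minimal sketch: pass attention_mask and pad_token_id explicitly, as the warnings request.
inputs = tokenizer("The inventor of the electric lamp is", return_tensors="pt").to("cuda")
output = quantized_model.generate(
    **inputs,                             # passes input_ids AND attention_mask
    pad_token_id=tokenizer.eos_token_id,  # explicit pad token for open-ended generation
    min_new_tokens=128,
    max_new_tokens=128,
)
print(tokenizer.decode(output[0], skip_special_tokens=True))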
from transformers import AutoTokenizer, AutoModelForCausalLM
quantized_model = AutoModelForCausalLM.from_pretrained(
    "ISTA-DASLab/Meta-Llama-3-70B-AQLM-PV-1Bit-1x16",
    torch_dtype="auto", device_map="auto", low_cpu_mem_usage=True,
)
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-70B")
output = quantized_model.generate(tokenizer("What is the result 1+1=؟", return_tensors="pt")["input_ids"].cuda(), min_new_tokens=20, max_new_tokens=20)
print(tokenizer.decode(output[0]))
(This run prints the same attention-mask, pad-token and AQLM kernel warnings as the previous one.)
<|begin_of_text|>What is the result 1+1=؟vetica uncon uncon uncon uncon uncon uncon uncon uncon uncon uncon uncon uncon uncon
from transformers import AutoTokenizer, AutoModelForCausalLM
quantized_model = AutoModelForCausalLM.from_pretrained(
    "ISTA-DASLab/Meta-Llama-3-70B-AQLM-PV-1Bit-1x16",
    torch_dtype="auto", device_map="auto", low_cpu_mem_usage=True,
)
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-70B")
output = quantized_model.generate(tokenizer("1+1=؟", return_tensors="pt")["input_ids"].cuda(), min_new_tokens=20, max_new_tokens=20)
print(tokenizer.decode(output[0]))
(Same attention-mask and pad-token warnings as above.)
<|begin_of_text|>1+1=؟586774774774774774774774774774774774774774774774774774774774
What is the solution for the incomprehensible model outputs, even after changing the tokenizer?
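Since the only change between these runs is the tokenizer repo, a quick diagnostic sketch (not a fix) is to check whether the two tokenizers even produce different ids for the same prompt:

# Diagnostic sketch: compare ids from the AQLM repo tokenizer and the base Llama-3 tokenizer.
from transformers import AutoTokenizer

tok_aqlm = AutoTokenizer.from_pretrained("ISTA-DASLab/Meta-Llama-3-70B-AQLM-PV-1Bit-1x16")
tok_base = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-70B")

prompt = "1+1=؟"
print(tok_aqlm(prompt)["input_ids"])  # if these two lists match,
print(tok_base(prompt)["input_ids"])  # the tokenizer choice is unlikely to be the culprit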
from transformers import AutoTokenizer, AutoModelForCausalLM
tokenizer = AutoTokenizer.from_pretrained("ISTA-DASLab/Meta-Llama-3-70B-AQLM-PV-1Bit-1x16")
model = AutoModelForCausalLM.from_pretrained(
    "ISTA-DASLab/Meta-Llama-3-70B-AQLM-PV-1Bit-1x16", device_map="auto"
)
prompt = """How many helicopters can a human eat in one sitting? Reply as a thug."""
model_inputs = tokenizer([prompt], return_tensors="pt").to("cuda")
input_length = model_inputs.input_ids.shape[1]
generated_ids = model.generate(**model_inputs, max_new_tokens=20)
print(tokenizer.batch_decode(generated_ids[:, input_length:], skip_special_tokens=True)[0])
Flores Flores Flores Flores Flores Flores Flores Floresigneigneigneigneigneigneigneigneigneigneigneigne
Still a bad result.
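Note that this prompt is written in a chat style while the checkpoint is a base (non-instruct) model, so there is no chat template to apply here. Purely as a sketch, if an Instruct tokenizer were used instead (meta-llama/Meta-Llama-3-70B-Instruct is my assumption, not a repo from the runs above), the prompt would normally be wrapped like this:

# Sketch only: chat formatting with an *Instruct* tokenizer (assumed repo name).
from transformers import AutoTokenizer

tok_instruct = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-70B-Instruct")
messages = [{"role": "user", "content": "How many helicopters can a human eat in one sitting? Reply as a thug."}]
input_ids = tok_instruct.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt")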
from transformers import AutoTokenizer, AutoModelForCausalLM
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-70B")
model = AutoModelForCausalLM.from_pretrained(
    "ISTA-DASLab/Meta-Llama-3-70B-AQLM-PV-1Bit-1x16", device_map="auto"
)
prompt = """How many helicopters can a human eat in one sitting? Reply as a thug."""
model_inputs = tokenizer([prompt], return_tensors="pt").to("cuda")
input_length = model_inputs.input_ids.shape[1]
generated_ids = model.generate(**model_inputs, max_new_tokens=20)
print(tokenizer.batch_decode(generated_ids[:, input_length:], skip_special_tokens=True)[0])
Clyveticaveticaveticaveticaveticaveticaveticaveticaveticaveticaveticaveticaveticaveticaveticaveticaveticaveticavetica
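One last diagnostic: a T4 has 16 GB of VRAM, so if the quantized 70B checkpoint does not fit entirely on the GPU, device_map="auto" will silently offload modules to CPU, which could also help explain the ~2 min 41 s wall time for 128 tokens. A minimal sketch to check where the layers actually landed (hf_device_map is the placement dict that accelerate attaches when device_map="auto" is used):

# Diagnostic sketch: count how many modules ended up on each device (GPU index, "cpu" or "disk").
import collections

print(collections.Counter(model.hf_device_map.values()))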