It runs, but the output is garbled

#1
by sdyy - opened

Colab T4

from transformers import AutoTokenizer, AutoModelForCausalLM

quantized_model = AutoModelForCausalLM.from_pretrained(
    "ISTA-DASLab/Meta-Llama-3-70B-AQLM-PV-1Bit-1x16",
    torch_dtype="auto", device_map="auto", low_cpu_mem_usage=True,
)
tokenizer = AutoTokenizer.from_pretrained("ISTA-DASLab/Meta-Llama-3-70B-AQLM-PV-1Bit-1x16")

%%time
output = quantized_model.generate(tokenizer("The inventor of the electric lamp is", return_tensors="pt")["input_ids"].cuda(), min_new_tokens=128, max_new_tokens=128)
print(tokenizer.decode(output[0]))
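Since this is a 70B checkpoint squeezed onto a 16 GB T4 via 1-bit AQLM, part of it may end up offloaded to CPU. A quick diagnostic sketch (assuming the standard transformers/accelerate attributes) shows where the layers actually landed and what dtype torch_dtype="auto" resolved to:

# Optional diagnostic: layer placement and effective dtype of the loaded model.
print(quantized_model.hf_device_map)                         # per-module device placement from device_map="auto"
print(quantized_model.dtype)                                 # dtype of the non-quantized modules
print(quantized_model.get_memory_footprint() / 1e9, "GB")    # rough memory footprint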

import json
import textwrap

system_prompt = "You are a helpful assistant."

def get_prompt(human_prompt):
    # Wrap the user message in a simple system/USER/ASSISTANT template.
    prompt_template = f"{system_prompt}\n\nUSER: {human_prompt} \nASSISTANT: "
    return prompt_template

def remove_human_text(text):
    # Cut off anything the model generates after a follow-up "USER:" turn.
    return text.split('USER:', 1)[0]

def parse_text(data):
    # Extract and pretty-print the assistant part of each pipeline result.
    for item in data:
        text = item['generated_text']
        assistant_text_index = text.find('ASSISTANT:')
        if assistant_text_index != -1:
            assistant_text = text[assistant_text_index + len('ASSISTANT:'):].strip()
            assistant_text = remove_human_text(assistant_text)
            wrapped_text = textwrap.fill(assistant_text, width=100)
            print("#####", wrapped_text)
            # return assistant_text

from transformers import GenerationConfig, pipeline

pipe = pipeline(
    "text-generation",
    model=quantized_model,
    tokenizer=tokenizer,
    max_length=512,
    temperature=0.7,
    top_p=0.95,
    do_sample=True,
)

%%time
prompt = '''Who is Napoleon Bonaparte?
'''
raw_output = pipe(get_prompt(prompt))

parse_text(raw_output)

Setting pad_token_id to eos_token_id:128001 for open-end generation.

Napoleon to, Unidos!'s. service. to. information to,encing; actions's. . from and to\\\\.

by,GEN, Nadu, Unidos,GEN, to\\. phone, Unidos, Unidos,encing,rieved.itations. to..POST,GEN
that\\,itation. to is. Commons,iating. the Aires, sẻ. service..POST's the.POST. to was\\ that
the services,inars\\ \\. account. to to,iable, Attribution,ше’s. Commons. information,encing,
loans with. service : the's,itations. . to will... the\\ need, Unidos. to need,GED.GEN need.
to,GEN to\\ that that the.POST.. . to,antro to to, Unidos. information. by,mination to, Aires.
to. browser, Nadu:. .’s's. service. account. to by,GEN.. by,้ email,iating. enough that was. that.
that\\\\'s. that the services that.. account's to. to, Nadu, Nadu..POST.
available,.EntityFrameworkCore. to the.awt.'s is to. 's,ше... to: Commons's the to's\\. phone the
by. that. understanding. enoughQuestion. email,antro with.. enough. email to\\. that.'s can.
with. service\\ -Cola with.’s: contact.. works, UnidosQuestion. Aires. email. account.
services..,..awt. information: Aires\\. to. needs\\, Nadu's the!, Commons. service. from,
with..awt.. information. means. Commons's, Scholar. account. by the services, Unidos the works:
information the account to. email. works. .: information that.. browser\\. visit. to. phone\\.
that. because. because the service..util. email,antro, Aires. works,ivery by. use work. services
the hosting need to\\. account.\\ the devices\\. information and\\.. to,iates the service,
Unidos..awt. to are: work; to, Unidos.. : .. by. enough. by to's., Unidos the website the's
service. that. the, Unidos to: information,iable,inars. service:\\, Unidos. account\\\\
CPU times: user 2min 23s, sys: 17.1 s, total: 2min 40s
Wall time: 2min 41s

Why is the output complete gibberish?
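One small, hedged tweak (it only addresses the pad_token_id notice above, not the gibberish itself): pass the pad token explicitly so the pipeline does not fall back to open-end-generation defaults. This assumes the text-generation pipeline forwards extra keyword arguments to generate().

# Assumption: extra kwargs are forwarded to generate(), so the pad token can
# be set per call; this should silence the "Setting pad_token_id" message.
raw_output = pipe(get_prompt(prompt), pad_token_id=tokenizer.eos_token_id)
parse_text(raw_output)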

from transformers import AutoTokenizer, AutoModelForCausalLM

quantized_model = AutoModelForCausalLM.from_pretrained(
    "ISTA-DASLab/Meta-Llama-3-70B-AQLM-PV-1Bit-1x16",
    torch_dtype="auto", device_map="auto", low_cpu_mem_usage=True,
)
tokenizer = AutoTokenizer.from_pretrained("ISTA-DASLab/Meta-Llama-3-70B-AQLM-PV-1Bit-1x16")

output = quantized_model.generate(tokenizer("The inventor of the electric lamp is", return_tensors="pt")["input_ids"].cuda(), min_new_tokens=128, max_new_tokens=128)
print(tokenizer.decode(output[0]))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's attention_mask to obtain reliable results.
Setting pad_token_id to eos_token_id:128001 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's attention_mask to obtain reliable results.
/usr/local/lib/python3.10/dist-packages/torch/utils/cpp_extension.py:1965: UserWarning: TORCH_CUDA_ARCH_LIST is not set, all archs for visible cards are included for compilation.
If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST'].
warnings.warn(
/usr/local/lib/python3.10/dist-packages/aqlm/inference_kernels/cuda_kernel.py:20: FutureWarning: torch.library.impl_abstract was renamed to torch.library.register_fake. Please use that instead; we will remove torch.library.impl_abstract in a future version of PyTorch.
@torch.library.impl_abstract("aqlm::code1x16_matmat")
/usr/local/lib/python3.10/dist-packages/aqlm/inference_kernels/cuda_kernel.py:33: FutureWarning: torch.library.impl_abstract was renamed to torch.library.register_fake. Please use that instead; we will remove torch.library.impl_abstract in a future version of PyTorch.
@torch.library.impl_abstract("aqlm::code1x16_matmat_dequant")
/usr/local/lib/python3.10/dist-packages/aqlm/inference_kernels/cuda_kernel.py:48: FutureWarning: torch.library.impl_abstract was renamed to torch.library.register_fake. Please use that instead; we will remove torch.library.impl_abstract in a future version of PyTorch.
@torch.library.impl_abstract("aqlm::code1x16_matmat_dequant_transposed")
/usr/local/lib/python3.10/dist-packages/aqlm/inference_kernels/cuda_kernel.py:62: FutureWarning: torch.library.impl_abstract was renamed to torch.library.register_fake. Please use that instead; we will remove torch.library.impl_abstract in a future version of PyTorch.
@torch.library.impl_abstract("aqlm::code2x8_matmat")
/usr/local/lib/python3.10/dist-packages/aqlm/inference_kernels/cuda_kernel.py:75: FutureWarning: torch.library.impl_abstract was renamed to torch.library.register_fake. Please use that instead; we will remove torch.library.impl_abstract in a future version of PyTorch.
@torch.library.impl_abstract("aqlm::code2x8_matmat_dequant")
/usr/local/lib/python3.10/dist-packages/aqlm/inference_kernels/cuda_kernel.py:88: FutureWarning: torch.library.impl_abstract was renamed to torch.library.register_fake. Please use that instead; we will remove torch.library.impl_abstract in a future version of PyTorch.
@torch.library.impl_abstract("aqlm::code2x8_matmat_dequant_transposed")
The inventor of the electric lamp isEMPLARY металли металли металли металли металлиückückückückückückückückückückückückückückückück

What is the fix for the incomprehensible words in the model's response?
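Not a confirmed fix for the garbled words, but the warnings above explicitly ask for the attention mask, so here is a hedged sketch that passes the full tokenizer output plus an explicit pad_token_id:

# Untested: pass attention_mask and pad_token_id as the warnings suggest.
inputs = tokenizer("The inventor of the electric lamp is", return_tensors="pt").to("cuda")
output = quantized_model.generate(
    **inputs,                              # input_ids and attention_mask
    pad_token_id=tokenizer.eos_token_id,   # avoids the open-end generation notice
    min_new_tokens=128,
    max_new_tokens=128,
)
print(tokenizer.decode(output[0], skip_special_tokens=True))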

from transformers import AutoTokenizer, AutoModelForCausalLM

quantized_model = AutoModelForCausalLM.from_pretrained(
    "ISTA-DASLab/Meta-Llama-3-70B-AQLM-PV-1Bit-1x16",
    torch_dtype="auto", device_map="auto", low_cpu_mem_usage=True,
)
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-70B")

output = quantized_model.generate(tokenizer("What is the result 1+1=؟", return_tensors="pt")["input_ids"].cuda(), min_new_tokens=20, max_new_tokens=20)
print(tokenizer.decode(output[0]))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's attention_mask to obtain reliable results.
Setting pad_token_id to eos_token_id:128001 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's attention_mask to obtain reliable results.
(same TORCH_CUDA_ARCH_LIST UserWarning and aqlm FutureWarning messages as in the first run)
<|begin_of_text|>What is the result 1+1=؟vetica uncon uncon uncon uncon uncon uncon uncon uncon uncon uncon uncon uncon uncon
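Unrelated to the garbled text, but the TORCH_CUDA_ARCH_LIST UserWarning can be addressed exactly as it suggests, by pinning the arch list before the AQLM CUDA kernels are compiled (sketch; 7.5 is the compute capability of a Colab T4):

# Set before the first model load so the AQLM kernels compile for sm_75 only.
import os
os.environ["TORCH_CUDA_ARCH_LIST"] = "7.5"  # Colab T4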

from transformers import AutoTokenizer, AutoModelForCausalLM

quantized_model = AutoModelForCausalLM.from_pretrained(
    "ISTA-DASLab/Meta-Llama-3-70B-AQLM-PV-1Bit-1x16",
    torch_dtype="auto", device_map="auto", low_cpu_mem_usage=True,
)
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-70B")

output = quantized_model.generate(tokenizer("1+1=؟", return_tensors="pt")["input_ids"].cuda(), min_new_tokens=20, max_new_tokens=20)
print(tokenizer.decode(output[0]))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's attention_mask to obtain reliable results.
Setting pad_token_id to eos_token_id:128001 for open-end generation.
<|begin_of_text|>1+1=؟586774774774774774774774774774774774774774774774774774774774

What is the solution for the incomprehensible model output, even after changing the tokenizer?
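Not a confirmed cause either, but mismatched aqlm / transformers / accelerate builds seem worth ruling out before anything else; a quick version check using only the standard library:

# Print the versions installed in the Colab runtime.
from importlib.metadata import version

for pkg in ("aqlm", "transformers", "accelerate", "torch"):
    print(pkg, version(pkg))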

from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("ISTA-DASLab/Meta-Llama-3-70B-AQLM-PV-1Bit-1x16")
model = AutoModelForCausalLM.from_pretrained(
    "ISTA-DASLab/Meta-Llama-3-70B-AQLM-PV-1Bit-1x16", device_map="auto"
)

prompt = """How many helicopters can a human eat in one sitting? Reply as a thug."""
model_inputs = tokenizer([prompt], return_tensors="pt").to("cuda")
input_length = model_inputs.input_ids.shape[1]
generated_ids = model.generate(**model_inputs, max_new_tokens=20)
print(tokenizer.batch_decode(generated_ids[:, input_length:], skip_special_tokens=True)[0])

Flores Flores Flores Flores Flores Flores Flores Floresigneigneigneigneigneigneigneigneigneigneigneigne

Bad output again.
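Purely a guess, nothing in the logs confirms it: the T4 (sm_75) has no native bfloat16 support, and torch_dtype="auto" follows the checkpoint config, so forcing the non-quantized layers to float16 is a cheap experiment:

import torch
from transformers import AutoModelForCausalLM

# Hypothetical experiment: load with an explicit float16 dtype instead of "auto".
quantized_model = AutoModelForCausalLM.from_pretrained(
    "ISTA-DASLab/Meta-Llama-3-70B-AQLM-PV-1Bit-1x16",
    torch_dtype=torch.float16,
    device_map="auto",
    low_cpu_mem_usage=True,
)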

from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-70B")
model = AutoModelForCausalLM.from_pretrained(
    "ISTA-DASLab/Meta-Llama-3-70B-AQLM-PV-1Bit-1x16", device_map="auto"
)

prompt = """How many helicopters can a human eat in one sitting? Reply as a thug."""
model_inputs = tokenizer([prompt], return_tensors="pt").to("cuda")
input_length = model_inputs.input_ids.shape[1]
generated_ids = model.generate(**model_inputs, max_new_tokens=20)
print(tokenizer.batch_decode(generated_ids[:, input_length:], skip_special_tokens=True)[0])

Clyveticaveticaveticaveticaveticaveticaveticaveticaveticaveticaveticaveticaveticaveticaveticaveticaveticaveticavetica
