Upload README.md
README.md (changed)
```diff
@@ -168,8 +168,8 @@ model_name_or_path = "TheBloke/LlongOrca-7B-16K-GPTQ"
 # To use a different branch, change revision
 # For example: revision="gptq-4bit-32g-actorder_True"
 model = AutoModelForCausalLM.from_pretrained(model_name_or_path,
-                                             torch_dtype=torch.bfloat16,
                                              device_map="auto",
+                                             trust_remote_code=False,
                                              revision="main")
 
 tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)
@@ -186,7 +186,7 @@ prompt_template=f'''<|im_start|>system
 print("\n\n*** Generate:")
 
 input_ids = tokenizer(prompt_template, return_tensors='pt').input_ids.cuda()
-output = model.generate(inputs=input_ids, temperature=0.7, max_new_tokens=512)
+output = model.generate(inputs=input_ids, temperature=0.7, do_sample=True, top_p=0.95, top_k=40, max_new_tokens=512)
 print(tokenizer.decode(output[0]))
 
 # Inference can also be done using transformers' pipeline
@@ -197,9 +197,11 @@ pipe = pipeline(
     model=model,
     tokenizer=tokenizer,
     max_new_tokens=512,
+    do_sample=True,
     temperature=0.7,
     top_p=0.95,
-
+    top_k=40,
+    repetition_penalty=1.1
 )
 
 print(pipe(prompt_template)[0]['generated_text'])
```
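Taken together, the updated example reads roughly as follows. This is a sketch assembled from the new side of the diff: the import line, the `"text-generation"` task string, and the short ChatML-style prompt are assumptions added here so the snippet is self-contained (the README defines its own `prompt_template`), and it presumes the AutoGPTQ/Transformers setup the rest of the README describes. The sampling parameters (`do_sample=True`, `temperature=0.7`, `top_p=0.95`, `top_k=40`, `repetition_penalty=1.1`) are exactly those introduced above.

```python
# Sketch of the post-change example. Imports, the task string and the prompt
# are assumptions added for completeness; parameter values come from the diff.
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

model_name_or_path = "TheBloke/LlongOrca-7B-16K-GPTQ"

# To use a different branch, change revision
# For example: revision="gptq-4bit-32g-actorder_True"
model = AutoModelForCausalLM.from_pretrained(model_name_or_path,
                                             device_map="auto",
                                             trust_remote_code=False,
                                             revision="main")

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)

# Illustrative ChatML-style prompt; the README defines its own prompt_template.
prompt = "Tell me about AI"
prompt_template = f'''<|im_start|>system
You are a helpful assistant.<|im_end|>
<|im_start|>user
{prompt}<|im_end|>
<|im_start|>assistant
'''

print("\n\n*** Generate:")

# Direct generation: sampling is now explicit (do_sample=True), so the
# temperature, top_p and top_k settings actually take effect.
input_ids = tokenizer(prompt_template, return_tensors='pt').input_ids.cuda()
output = model.generate(inputs=input_ids, temperature=0.7, do_sample=True,
                        top_p=0.95, top_k=40, max_new_tokens=512)
print(tokenizer.decode(output[0]))

# Inference can also be done using transformers' pipeline
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=512,
    do_sample=True,
    temperature=0.7,
    top_p=0.95,
    top_k=40,
    repetition_penalty=1.1
)

print(pipe(prompt_template)[0]['generated_text'])
```

The main behavioural change is `do_sample=True`: without it, `generate()` and the pipeline default to greedy decoding, and `temperature`, `top_p` and `top_k` are effectively ignored (recent Transformers versions warn about this), so the added flag is what makes the sampling settings take effect.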