m-polignano-uniba committed
Commit 8f1b234 (parent: 9135d96)

Update README.md

Files changed (1): README.md (+7, -2)
README.md CHANGED
@@ -85,7 +85,12 @@ tokenizer.chat_template = "{% set ns = namespace(i=0) %}" \
     "{% set ns.i = ns.i+1 %}" \
     "{% endfor %}"
 
-
+model = AutoModelForCausalLM.from_pretrained(
+    model,
+    torch_dtype=torch.float16,
+    device_map='balanced',
+    use_flash_attention_2=True
+)
 
 pipe = transformers.pipeline(model=model,
     device_map="balanced",
@@ -93,7 +98,7 @@ pipe = transformers.pipeline(model=model,
     return_full_text=False, # langchain expects the full text
     task='text-generation',
     max_new_tokens=512, # max number of tokens to generate in the output
-    temperature=0.8 #temperature
+    temperature=0.7 #temperature
 )
 messages = [{"role": "user", "content": "Cosa sono i word embeddings?"}]
 text = tokenizer.apply_chat_template(messages, tokenize=False)
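
Read together, the two hunks (1) load the model in fp16 with a balanced device map and FlashAttention 2 before the pipeline is built, and (2) lower the sampling temperature from 0.8 to 0.7. Below is a minimal, self-contained sketch of how the README snippet reads after this commit. It is not the verbatim README: the model id, the tokenizer loading, the imports, and the final generation call are assumptions added for completeness, while the `from_pretrained` and `pipeline` arguments mirror the diff (the README also sets a custom `tokenizer.chat_template`, elided here).

```python
# Sketch of the snippet as it reads after this commit (not the verbatim README).
# "model_id" is a placeholder repo id; the real identifier is defined earlier in the README.
import torch
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "<namespace>/<model-name>"  # placeholder (assumption)

tokenizer = AutoTokenizer.from_pretrained(model_id)
# The README also assigns a custom tokenizer.chat_template at this point (omitted here).

# Added by this commit: fp16 weights, balanced multi-GPU placement, FlashAttention 2.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    device_map="balanced",
    use_flash_attention_2=True,
)

pipe = transformers.pipeline(
    model=model,
    tokenizer=tokenizer,            # passed explicitly here (assumption, for self-containment)
    device_map="balanced",
    return_full_text=False,          # return only the generated continuation
    task="text-generation",
    max_new_tokens=512,              # max number of tokens to generate in the output
    temperature=0.7,                 # lowered from 0.8 by this commit
)

messages = [{"role": "user", "content": "Cosa sono i word embeddings?"}]
text = tokenizer.apply_chat_template(messages, tokenize=False)
print(pipe(text)[0]["generated_text"])  # assumed final step: run the prompt through the pipeline
```

Note that the README reuses the `model` variable for both the repo id string and the loaded model object; the sketch uses a separate `model_id` name only for readability.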