File size: 22,700 Bytes
4ea2be3
a667259
4ea2be3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f7a4840
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4ea2be3
 
 
b002e98
 
4ea2be3
 
 
 
 
 
 
 
b002e98
 
4ea2be3
82d89e6
4ea2be3
b002e98
4ea2be3
 
0fb098c
 
4ea2be3
b2befb1
 
4ea2be3
 
 
b2befb1
 
4ea2be3
 
 
 
 
 
 
 
 
 
 
 
b002e98
4ea2be3
 
 
 
 
 
 
 
 
 
 
884bc48
 
4ea2be3
 
 
 
3d7b462
3f2f0bb
3d7b462
4ea2be3
4062100
4ea2be3
3f82e31
0624377
 
4ea2be3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2612bf1
4ea2be3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dd4ea0a
4ea2be3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dd4ea0a
4ea2be3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b2befb1
4ea2be3
 
 
 
b2befb1
4ea2be3
 
 
 
 
 
 
3f2f0bb
4ea2be3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4062100
4ea2be3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d8c08aa
 
 
 
 
 
 
 
 
 
 
 
 
4ea2be3
 
 
 
 
 
 
 
 
d8c08aa
4ea2be3
 
 
 
 
5693173
4ea2be3
 
 
 
 
 
 
 
785f462
4ea2be3
 
785f462
 
 
4ea2be3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82d89e6
4ea2be3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f7a4840
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
---
license: llama3.1
datasets:
- HPAI-BSC/Aloe-Beta-General-Collection
- HPAI-BSC/chain-of-diagnosis
- HPAI-BSC/MedS-Ins
- HPAI-BSC/ultramedical
- HPAI-BSC/pubmedqa-cot-llama31
- HPAI-BSC/medqa-cot-llama31
- HPAI-BSC/medmcqa-cot-llama31
- HPAI-BSC/headqa-cot-llama31
- HPAI-BSC/MMLU-medical-cot-llama31
- HPAI-BSC/Polymed-QA
language:
- en
library_name: transformers
tags:
- biology
- medical
- healthcare
pipeline_tag: question-answering
model-index:
- name: Llama3.1-Aloe-Beta-8B
  results:
  - task:
      type: text-generation
      name: Text Generation
    dataset:
      name: IFEval (0-Shot)
      type: wis-k/instruction-following-eval
      split: train
      args:
        num_few_shot: 0
    metrics:
    - type: inst_level_strict_acc and prompt_level_strict_acc
      value: 72.53
      name: averaged accuracy
    source:
      url: https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard#/?search=HPAI-BSC%2FLlama3.1-Aloe-Beta-8B
      name: Open LLM Leaderboard
  - task:
      type: text-generation
      name: Text Generation
    dataset:
      name: BBH (3-Shot)
      type: SaylorTwift/bbh
      split: test
      args:
        num_few_shot: 3
    metrics:
    - type: acc_norm
      value: 30.37
      name: normalized accuracy
    source:
      url: https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard#/?search=HPAI-BSC%2FLlama3.1-Aloe-Beta-8B
      name: Open LLM Leaderboard
  - task:
      type: text-generation
      name: Text Generation
    dataset:
      name: MATH Lvl 5 (4-Shot)
      type: lighteval/MATH-Hard
      split: test
      args:
        num_few_shot: 4
    metrics:
    - type: exact_match
      value: 1.66
      name: exact match
    source:
      url: https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard#/?search=HPAI-BSC%2FLlama3.1-Aloe-Beta-8B
      name: Open LLM Leaderboard
  - task:
      type: text-generation
      name: Text Generation
    dataset:
      name: GPQA (0-shot)
      type: Idavidrein/gpqa
      split: train
      args:
        num_few_shot: 0
    metrics:
    - type: acc_norm
      value: 2.46
      name: acc_norm
    source:
      url: https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard#/?search=HPAI-BSC%2FLlama3.1-Aloe-Beta-8B
      name: Open LLM Leaderboard
  - task:
      type: text-generation
      name: Text Generation
    dataset:
      name: MuSR (0-shot)
      type: TAUR-Lab/MuSR
      args:
        num_few_shot: 0
    metrics:
    - type: acc_norm
      value: 6.83
      name: acc_norm
    source:
      url: https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard#/?search=HPAI-BSC%2FLlama3.1-Aloe-Beta-8B
      name: Open LLM Leaderboard
  - task:
      type: text-generation
      name: Text Generation
    dataset:
      name: MMLU-PRO (5-shot)
      type: TIGER-Lab/MMLU-Pro
      config: main
      split: test
      args:
        num_few_shot: 5
    metrics:
    - type: acc
      value: 28.67
      name: accuracy
    source:
      url: https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard#/?search=HPAI-BSC%2FLlama3.1-Aloe-Beta-8B
      name: Open LLM Leaderboard
---
<p align="center">
  <picture>
    <source media="(prefers-color-scheme: dark)" srcset="https://cdn-uploads.huggingface.co/production/uploads/6620f941eba5274b5c12f83d/vg1jG1OgqP7yyE0PO-OMT.png">
    <img alt="prompt_engine" src="https://cdn-uploads.huggingface.co/production/uploads/6620f941eba5274b5c12f83d/vg1jG1OgqP7yyE0PO-OMT.png" width=50%>
  </picture>
</p>
<h1 align="center">
Aloe: A Family of Fine-tuned Open Healthcare LLMs
</h1>

---


Llama3.1-Aloe-Beta-8B is an **open healthcare LLM** achieving **state-of-the-art performance** on several medical tasks. Aloe Beta is made available in two model sizes: [8B](https://huggingface.co/HPAI-BSC/Llama3.1-Aloe-Beta-8B) and [70B](https://huggingface.co/HPAI-BSC/Llama3.1-Aloe-Beta-70B). Both models are trained using the same recipe.

Aloe is trained on 20 medical tasks, resulting in a robust and versatile healthcare model. Evaluations show Aloe models to be among the best in their class. When combined with a RAG system ([also released](https://github.com/HPAI-BSC/prompt_engine)) the 8B version gets close to the performance of closed models like MedPalm-2, GPT4. With the same RAG system, Aloe-Beta-70B outperforms those private alternatives, producing state-of-the-art results.

# Aloe-Beta-8B



![image/png](https://cdn-uploads.huggingface.co/production/uploads/62f7a16192950415b637e201/VUYw4IdANKGrH2VOedwH0.png)

**Aloe-8B-Beta** is the latest iteration in the **Aloe family**, building and improving on the success of its predecessor, [Aloe-8B-Alpha](https://huggingface.co/HPAI-BSC/Llama3-Aloe-8B-Alpha). 
Beta more than triples the training data used by Alpha, for a total of **1.8B tokens**, including a wider variety of medical tasks and instructions (e.g., text summarization, explanation, diagnosis, text classification, treatment recommendation, ...).

![image/png](https://cdn-uploads.huggingface.co/production/uploads/62f7a16192950415b637e201/bCuV5kZUT9H9UECAOWDRc.png)

To mitigate catastrophic forgetting and enable the model to effectively learn new capabilities like **function calling**, we incorporated a diverse set of high-quality general-purpose data constituting 20% of the total training set. The curated data includes some of the highest-quality content available across a range of topics, including mathematics, programming, STEM, and very long instructions (> 8k tokens), to enrich the model's adaptability and comprehension across diverse domains.

Beta also boosts the alignment and safety stages with respect to Alpha. This includes a [medical preference dataset](https://huggingface.co/datasets/TsinghuaC3I/UltraMedical-Preference), as well as the red-teaming dataset (available soon).

Complete training details, model merging configurations, and all training data (including synthetically generated data) can be found below. This includes [the RAG system](https://github.com/HPAI-BSC/prompt_engine) that was developed to test Aloe Beta in a deployment setup. Aloe comes with a healthcare-specific risk assessment to facilitate the safe use and deployment of such systems.


## Model Details

### Model Description

- **Developed by:** [HPAI](https://hpai.bsc.es/)
- **Model type:** Causal decoder-only transformer language model
- **Language(s) (NLP):** English (capable but not formally evaluated on other languages)
- **License:** This model is based on Meta Llama 3.1 8B and is governed by the [Meta Llama 3.1 License](https://www.llama.com/llama3_1/license/). All our modifications are available with a [CC BY 4.0](https://creativecommons.org/licenses/by/4.0/) license, making the Aloe Beta models **compatible with commercial use**.
- **Base model:** [meta-llama/Llama-3.1-8B](https://huggingface.co/meta-llama/Llama-3.1-8B)
- **Paper:** (more coming soon)
- **RAG Repository:** https://github.com/HPAI-BSC/prompt_engine

### [](https://huggingface.co/templates/model-card-example#model-sources-optional)Model Sources [optional]

## Model Performance

Aloe Beta has been tested on the most popular healthcare QA datasets, with and without the Medprompt inference technique. Results show competitive performance, achieving SOTA within models of the same size.



![image/png](https://cdn-uploads.huggingface.co/production/uploads/6620f941eba5274b5c12f83d/Fny7e0VpBKxXfpA0LBczN.png)

The Beta model has been developed to excel in several different medical tasks. For this reason, we evaluated the model in many different medical tasks:


![image/png](https://cdn-uploads.huggingface.co/production/uploads/6620f941eba5274b5c12f83d/6hi8GcblTxMohXQfA229U.png)

![image/png](https://cdn-uploads.huggingface.co/production/uploads/6620f941eba5274b5c12f83d/Si98VYTJ2eS_gbjJ8FvM2.png)

We also compared the performance of the model in the general domain, using the OpenLLM Leaderboard benchmark. Aloe-Beta gets competitive results with the current SOTA general models in the most used general benchmarks and outperforms the medical models:



![image/png](https://cdn-uploads.huggingface.co/production/uploads/6620f941eba5274b5c12f83d/imK19fzyMUvIJaAbSVnGE.png)

## Uses

### Direct Use

We encourage the use of Aloe for research purposes, as a stepping stone to build better foundational models for healthcare. In production, Aloe should always be used under the supervision of a human expert.

### Out-of-Scope Use

These models are not to be used for clinical practice, medical diagnosis, or any other form of direct or indirect healthcare advice. Models are prone to error and can produce toxic content. The use of Aloe models for activities harmful to individuals, such as spam, fraud, or impersonation, is strictly prohibited. Minors should not be left alone to interact with Aloe without supervision.

## Bias, Risks, and Limitations

Aloe can produce toxic content under the appropriate prompts, and it includes multiple undesirable biases. While significant efforts were conducted to mitigate this (see Alignment details below), model safety cannot be fully guaranteed. We avoid the use of all personal data in our training.

We identify at least three risk cases specific to healthcare LLMs:
- Healthcare professional impersonation, a fraudulent behaviour which currently generates billions of dollars in [profit](https://www.justice.gov/opa/pr/justice-department-charges-dozens-12-billion-health-care-fraud). A model such as Aloe could be used to increase the efficacy of such deceiving activities, making them more widespread. The main preventive actions are public literacy on the unreliability of digitised information and the importance of medical registration, and legislation enforcing AI-generated content disclaimers. 
- Medical decision-making without professional supervision. While this is already an issue in modern societies (e.g., self-medication), a model such as Aloe, capable of producing high-quality conversational data, can facilitate self-delusion, particularly in the presence of sycophancy. By producing tailored responses, it can also be used to generate actionable answers. Public literacy on the dangers of self-diagnosis is one of the main defenses, together with the introduction of disclaimers and warnings on the models' outputs.
- Access to information on dangerous substances or procedures. While the literature on sensitive content can already be found on different sources (e.g., libraries, the internet, dark web), LLMs can centralize such access, making it nearly impossible to control the flow of such information. Model alignment can help in that regard, but so far the effects remain insufficient, as jailbreaking methods still overcome it.


<!---
Table below shows the performance of Aloe at several AI safety tasks:

TO BE UPDATED

<img src="https://cdn-uploads.huggingface.co/production/uploads/62972c4979f193515da1d38e/T6Jblpf1kmTkM04K716rM.png" width="95%">


We analyzed the safety and robustness of the model using red teaming techniques. We designed a benchmark using different types of attacks and analyzed the performance of Aloe and some extra models, and we confirm that our model is aligned properly and successfully resisting most attacks:


![image/png](https://cdn-uploads.huggingface.co/production/uploads/6620f941eba5274b5c12f83d/KS3yrHan1l1W0cYiXGG-G.png)


![image/png](https://cdn-uploads.huggingface.co/production/uploads/6620f941eba5274b5c12f83d/SYC0qljpLGLmMgx0a623W.png)

-->

## How to Get Started with the Model

Use the code below to get started with the model. You can run conversational inference using the Transformers pipeline abstraction, or by leveraging the Auto classes with the `generate()` function. Let's see examples for both.

#### Transformers pipeline

```python
import transformers
import torch

model_id = "HPAI-BSC/Llama3.1-Aloe-Beta-8B"

pipeline = transformers.pipeline(
    "text-generation",
    model=model_id,
    model_kwargs={"torch_dtype": torch.bfloat16},
    device_map="auto",
)

messages = [
    {"role": "system", "content": "You are an expert medical assistant named Aloe, developed by the High Performance Artificial Intelligence Group at Barcelona Supercomputing Center(BSC). You are to be a helpful, respectful, and honest assistant."},
    {"role": "user", "content": "Hello."},
]

prompt = pipeline.tokenizer.apply_chat_template(
		messages, 
		tokenize=False, 
		add_generation_prompt=True
)

terminators = [
    pipeline.tokenizer.eos_token_id,
    pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>")
]

outputs = pipeline(
    prompt,
    max_new_tokens=256,
    eos_token_id=terminators,
    do_sample=True,
    temperature=0.6,
    top_p=0.9,
)
print(outputs[0]["generated_text"][len(prompt):])
```

#### Transformers AutoModelForCausalLM

```python
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

model_id = "HPAI-BSC/Llama3.1-Aloe-Beta-8B"

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

messages = [
    {"role": "system", "content": "You are an expert medical assistant named Aloe, developed by the High Performance Artificial Intelligence Group at Barcelona Supercomputing Center(BSC). You are to be a helpful, respectful, and honest assistant."},
    {"role": "user", "content": "Hello"},
]

input_ids = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt"
).to(model.device)

terminators = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("<|eot_id|>")
]

outputs = model.generate(
    input_ids,
    max_new_tokens=256,
    eos_token_id=terminators,
    do_sample=True,
    temperature=0.6,
    top_p=0.9,
)
response = outputs[0][input_ids.shape[-1]:]
print(tokenizer.decode(response, skip_special_tokens=True))
```

## Training Details

### Supervised fine-tuning
SFT on top of Llama 3.1 using axolotl (https://github.com/axolotl-ai-cloud/axolotl).

We used Deepspeed's Zero-3 distributed training using the following hardware:

* 8B: 32x NVIDIA Hopper H100 64GB of the *Marenostrum 5*. 
* 70B: 64x NVIDIA Hopper H100 64GB of the *Marenostrum 5*.


<!---
^^^ TO BE COMPLETED AND DETAILED ^^^
-->



#### Training Data

The training set consists of around 1.8B tokens, having 3 different types of data:

- Medical domain datasets. Includes data from 20 different medical tasks.
  - [HPAI-BSC/Aloe-Beta-General-Collection](https://huggingface.co/datasets/HPAI-BSC/Aloe-Beta-General-Collection)
  - [HPAI-BSC/chain-of-diagnosis](https://huggingface.co/datasets/HPAI-BSC/chain-of-diagnosis)
  - [HPAI-BSC/MedS-Ins](https://huggingface.co/datasets/HPAI-BSC/MedS-Ins)
  - [HPAI-BSC/ultramedical](https://huggingface.co/datasets/HPAI-BSC/ultramedical)
- Synthetic data. We expanded our training data by generating high-quality answers using Llama3.1-70B.
  - [HPAI-BSC/pubmedqa-cot-llama31](https://huggingface.co/datasets/HPAI-BSC/pubmedqa-cot-llama31)
  - [HPAI-BSC/medqa-cot-llama31](https://huggingface.co/datasets/HPAI-BSC/medqa-cot-llama31)
  - [HPAI-BSC/medmcqa-cot-llama31](https://huggingface.co/datasets/HPAI-BSC/medmcqa-cot-llama31)
  - [HPAI-BSC/headqa-cot-llama31](https://huggingface.co/datasets/HPAI-BSC/headqa-cot-llama31)
  - [HPAI-BSC/MMLU-medical-cot-llama31](https://huggingface.co/datasets/HPAI-BSC/MMLU-medical-cot-llama31)
  - [HPAI-BSC/Polymed-QA](https://huggingface.co/datasets/HPAI-BSC/Polymed-QA)
  - Genstruct data (coming soon)
- General data. It includes maths, STEM, code, function calling, and instructions with a very long context.
  - [HPAI-BSC/Aloe-Beta-General-Collection](https://huggingface.co/datasets/HPAI-BSC/Aloe-Beta-General-Collection)

#### Training parameters
- Epochs: 3
- Sequence length: 16384
- Optimizer: adamw_torch
- Learning rate: 2e-5
- Learning rate scheduler: cosine
- Warmup steps: 100
- Weight decay: 0
- Gradient checkpointing
- Zero 3
- Total batch size: 128
- Batch size per device: 1
- Gradient accumulation steps: 4

### Model Merging
The model trained was merged with the Llama-3.1-Instruct model using the DARE_TIES technique. [Mergekit](https://github.com/arcee-ai/mergekit) was used to conduct the merging.

### Model Alignment
The model is aligned using the Direct Preference Optimization (DPO) technique through a two-step process:

1. General DPO Alignment: This step uses a dataset combining medical, general preference, and safety data. We used our dataset [HPAI-BSC/Aloe-Beta-DPO](https://huggingface.co/datasets/HPAI-BSC/Aloe-Beta-DPO). We split the dataset into five parts, and the model was trained iteratively for one epoch on each chunk. We used a learning rate of 2e-7.
2. Red-Teaming Alignment: This step further fine-tunes the model to resist a variety of potential attacks, enhancing its robustness and security. Dataset will be shared soon. In this stage, we set the learning rate to 1e-7.

<!---
^^^ LINKS TO DPO DATA (DPO added, missing the RT^^^
-->


We used the [OpenRLHF](https://github.com/OpenRLHF/OpenRLHF) library. We aligned the model using 16x NVIDIA Hopper H100 64GB of the *Marenostrum 5*. Common hyperparameters:

- Sequence length: 4096
- Optimizer: Fused adam
- Total batch size: 128
- Batch size per device: 1
- Gradient accumulation steps: 8
- Beta: 0.1



## Evaluation

### Testing Data, Factors & Metrics

#### Testing Data


- [ACI-BENCH](https://github.com/wyim/aci-bench)
- [MTS-Dialog](https://github.com/abachaa/MTS-Dialog)
- [MedText](https://huggingface.co/datasets/BI55/MedText)
- [Medical Text classification](https://www.kaggle.com/datasets/chaitanyakck/medical-text/data)
- [OLAPH](https://github.com/dmis-lab/OLAPH)
- CareQA Open
- [MedDialog](https://huggingface.co/datasets/bigbio/meddialog)
- [MEDIQA QA](https://huggingface.co/datasets/bigbio/mediqa_qa)
- [Meddialog Qsumm](https://huggingface.co/datasets/lighteval/med_dialog)
- [Biored](https://huggingface.co/datasets/YufeiHFUT/BioRED_all_info)
- [MIMIC-III](https://huggingface.co/datasets/dmacres/mimiciii-hospitalcourse-meta)
- [Medical Prescription](https://huggingface.co/datasets/devlocalhost/prescription-full)
- [MedQA (USMLE)](https://huggingface.co/datasets/bigbio/med_qa)
- [MedMCQA](https://huggingface.co/datasets/medmcqa)
- [PubMedQA](https://huggingface.co/datasets/bigbio/pubmed_qa)
- [MMLU-Medical](https://huggingface.co/datasets/lukaemon/mmlu)
- [MedQA-4-Option](https://huggingface.co/datasets/GBaker/MedQA-USMLE-4-options)
- [CareQA](https://huggingface.co/datasets/HPAI-BSC/CareQA)
- [Open LLM Leaderboard 2](https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard)

<!---
^^^ CAREQA Open link MISSING ^^^
-->

#### Metrics

- Accuracy: suited to the evaluation of multiple-choice question-answering tasks.
- Rouge1: refers to the overlap of unigrams between the system and the gold standard.


<!---
^^^ MORE METRICS MISSING ^^^
-->

#### Summary

To compare Aloe with the most competitive open models (both general purpose and healthcare-specific) we use popular healthcare datasets (PubMedQA, MedMCQA, MedQA and MMLU for six medical tasks only), together with the new and highly reliable CareQA. However, while MCQA benchmarks provide valuable insights into a model's ability to handle structured queries, they fall short in representing the full range of challenges faced in medical practice. Building upon this idea, Aloe-Beta represents the next step in the evolution of the Aloe Family, designed to broaden the scope beyond the multiple-choice question-answering tasks that defined Aloe-Alpha.


Benchmark results indicate the training conducted on Aloe has boosted its performance above Llama3.1-8B-Instruct. Llama3.1-Aloe-Beta-8B also outperforms other medical models like Llama3-OpenBioLLM and Llama3-Med42. All these results make Llama3.1-Aloe-Beta-8B the best healthcare LLM of its size.

With the help of prompting techniques, the performance of Llama3.1-Aloe-Beta-8B is significantly improved. Medprompting in particular provides a 7% increase in reported accuracy, after which Llama3.1-Aloe-Beta-8B only lags behind much bigger models like Llama-3.1-70B-Instruct or MedPalm-2. This improvement is mostly consistent across the OpenLLM Leaderboard and the other medical tasks.

## Environmental Impact

- **Hardware Type:** 32xH100
- **Hours used (8B):** 544 GPU hours
- **Hours used (70B):** 4500 GPU hours
- **Hardware Provider:** Barcelona Supercomputing Center (BSC)
- **Compute Region:** Spain
- **Carbon Emitted:** 34.1 kg of CO2

<!---
^^^ ARE CARBON EMISSIONS FOR BOTH? ^^^
-->


## Authors
Aloe Beta has been developed by the [High Performance Artificial Intelligence](https://hpai.bsc.es/) research group, from the [Barcelona Supercomputing Center - BSC](https://www.bsc.es/). Main authors are [Jordi Bayarri Planas](https://huggingface.co/JordiBayarri), [Ashwin Kumar Gururajan](https://huggingface.co/G-AshwinKumar) and [Dario Garcia-Gasulla](https://huggingface.co/dariog). Red teaming efforts led by Adrian Tormos.

Contact: [hpai@bsc.es](mailto:hpai@bsc.es)

## Citations


<!---
 Add the prompt engine paper below 
-->

If you use this repository in a published work, please cite the corresponding papers as source:

```
@misc{gururajan2024aloe,
      title={Aloe: A Family of Fine-tuned Open Healthcare LLMs}, 
      author={Ashwin Kumar Gururajan and Enrique Lopez-Cuena and Jordi Bayarri-Planas and Adrian Tormos and Daniel Hinjos and Pablo Bernabeu-Perez and Anna Arias-Duart and Pablo Agustin Martin-Torres and Lucia Urcelay-Ganzabal and Marta Gonzalez-Mallo and Sergio Alvarez-Napagao and Eduard Ayguadé-Parra and Ulises Cortés and Dario Garcia-Gasulla},
      year={2024},
      eprint={2405.01886},
      archivePrefix={arXiv},
      primaryClass={cs.CL}
}
```
# [Open LLM Leaderboard Evaluation Results](https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard)
Detailed results can be found [here](https://huggingface.co/datasets/open-llm-leaderboard/HPAI-BSC__Llama3.1-Aloe-Beta-8B-details)!
Summarized results can be found [here](https://huggingface.co/datasets/open-llm-leaderboard/contents/viewer/default/train?q=HPAI-BSC%2FLlama3.1-Aloe-Beta-8B&sort[column]=Average%20%E2%AC%86%EF%B8%8F&sort[direction]=desc)!

|      Metric       |Value (%)|
|-------------------|--------:|
|**Average**        |    23.75|
|IFEval (0-Shot)    |    72.53|
|BBH (3-Shot)       |    30.37|
|MATH Lvl 5 (4-Shot)|     1.66|
|GPQA (0-shot)      |     2.46|
|MuSR (0-shot)      |     6.83|
|MMLU-PRO (5-shot)  |    28.67|