Upload folder using huggingface_hub
Browse files
- README.md +78 -2
- artifact.py +7 -0
- card.py +5 -3
- collections.py +13 -2
- metrics.py +55 -2
- operators.py +1 -1
- settings_utils.py +1 -1
- templates.py +3 -3
- version.py +1 -1
README.md
CHANGED
@@ -57,10 +57,86 @@ Then launch the ui by running:
|
|
57 |
unitxt-explore
|
58 |
```
|
59 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
60 |
# 🦄 Contributors
|
61 |
|
62 |
Please install Unitxt from source by:
|
63 |
-
```
|
64 |
git clone git@github.com:IBM/unitxt.git
|
65 |
cd unitxt
|
66 |
pip install -e ".[dev]"
|
@@ -71,7 +147,7 @@ pre-commit install
|
|
71 |
|
72 |
If you use Unitxt in your research, please cite our paper:
|
73 |
|
74 |
-
```
|
75 |
@inproceedings{bandel-etal-2024-unitxt,
|
76 |
title = "Unitxt: Flexible, Shareable and Reusable Data Preparation and Evaluation for Generative {AI}",
|
77 |
author = "Bandel, Elron and
|
|
|
57 |
unitxt-explore
|
58 |
```
|
59 |
|
60 |
+
# 🦄 Example
|
61 |
+
|
62 |
+
This is a simple example of running an end-to-end evaluation over user data in self-contained Python code.
|
63 |
+
|
64 |
+
See more examples in the `examples` subdirectory.
|
65 |
+
|
66 |
+
```python
|
67 |
+
from unitxt import get_logger
|
68 |
+
from unitxt.api import evaluate, load_dataset
|
69 |
+
from unitxt.blocks import Task, TaskCard
|
70 |
+
from unitxt.inference import HFPipelineBasedInferenceEngine
|
71 |
+
from unitxt.loaders import LoadFromDictionary
|
72 |
+
from unitxt.templates import InputOutputTemplate, TemplatesDict
|
73 |
+
from unitxt.text_utils import print_dict
|
74 |
+
|
75 |
+
logger = get_logger()
|
76 |
+
|
77 |
+
# Set up question answer pairs in a dictionary
|
78 |
+
data = {
|
79 |
+
"test": [
|
80 |
+
{"question": "What is the capital of Texas?", "answer": "Austin"},
|
81 |
+
{"question": "What is the color of the sky?", "answer": "Blue"},
|
82 |
+
]
|
83 |
+
}
|
84 |
+
|
85 |
+
card = TaskCard(
|
86 |
+
# Load the data from the dictionary. Data can also be loaded from HF, CSV files, COS and other sources using different loaders.
|
87 |
+
loader=LoadFromDictionary(data=data),
|
88 |
+
# Define the QA task input and output and metrics.
|
89 |
+
task=Task(
|
90 |
+
input_fields={"question": str},
|
91 |
+
reference_fields={"answer": str},
|
92 |
+
prediction_type=str,
|
93 |
+
metrics=["metrics.accuracy"],
|
94 |
+
),
|
95 |
+
)
|
96 |
+
|
97 |
+
# Create a simple template that formats the input.
|
98 |
+
# Add lowercase normalization as a post processor on the model prediction.
|
99 |
+
|
100 |
+
template = InputOutputTemplate(
|
101 |
+
instruction="Answer the following question.",
|
102 |
+
input_format="{question}",
|
103 |
+
output_format="{answer}",
|
104 |
+
postprocessors=["processors.lower_case"],
|
105 |
+
)
|
106 |
+
# Verbalize the dataset using the template
|
107 |
+
dataset = load_dataset(card=card, template=template)
|
108 |
+
test_dataset = dataset["test"]
|
109 |
+
|
110 |
+
|
111 |
+
# Run inference with Flan-T5 base through the HF API;
|
112 |
+
# can be replaced with any prediction code,
|
113 |
+
# including the built-in WMLInferenceEngine and OpenAiInferenceEngine.
|
114 |
+
model_name = "google/flan-t5-base"
|
115 |
+
inference_model = HFPipelineBasedInferenceEngine(
|
116 |
+
model_name=model_name, max_new_tokens=32
|
117 |
+
)
|
118 |
+
predictions = inference_model.infer(test_dataset)
|
119 |
+
evaluated_dataset = evaluate(predictions=predictions, data=test_dataset)
|
120 |
+
|
121 |
+
# Print results
|
122 |
+
for instance in evaluated_dataset:
|
123 |
+
print_dict(
|
124 |
+
instance,
|
125 |
+
keys_to_print=[
|
126 |
+
"source", # input to the model
|
127 |
+
"prediction", # model prediction
|
128 |
+
"processed_prediction", # model prediction after post processing
|
129 |
+
"references", # reference answer
|
130 |
+
"score", # scores (per instance and global)
|
131 |
+
],
|
132 |
+
)
|
133 |
+
|
134 |
+
```
|
135 |
+
|
136 |
# 🦄 Contributors
|
137 |
|
138 |
Please install Unitxt from source by:
|
139 |
+
```bash
|
140 |
git clone git@github.com:IBM/unitxt.git
|
141 |
cd unitxt
|
142 |
pip install -e ".[dev]"
|
|
|
147 |
|
148 |
If you use Unitxt in your research, please cite our paper:
|
149 |
|
150 |
+
```bib
|
151 |
@inproceedings{bandel-etal-2024-unitxt,
|
152 |
title = "Unitxt: Flexible, Shareable and Reusable Data Preparation and Evaluation for Generative {AI}",
|
153 |
author = "Bandel, Elron and
|
artifact.py
CHANGED
@@ -295,6 +295,13 @@ class Artifact(Dataclass):
|
|
295 |
**self.process_data_before_dump(self._init_dict),
|
296 |
}
|
297 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
298 |
def process_data_before_dump(self, data):
|
299 |
return data
|
300 |
|
|
|
295 |
**self.process_data_before_dump(self._init_dict),
|
296 |
}
|
297 |
|
298 |
+
def __deepcopy__(self, memo):
|
299 |
+
if id(self) in memo:
|
300 |
+
return memo[id(self)]
|
301 |
+
new_obj = Artifact.from_dict(self.to_dict())
|
302 |
+
memo[id(self)] = new_obj
|
303 |
+
return new_obj
|
304 |
+
|
305 |
def process_data_before_dump(self, data):
|
306 |
return data
|
307 |
|
card.py
CHANGED
@@ -1,12 +1,12 @@
|
|
1 |
-
from typing import List
|
2 |
|
3 |
from .artifact import Artifact
|
4 |
-
from .collections import Collection
|
5 |
from .dataclass import OptionalField
|
6 |
from .loaders import Loader
|
7 |
from .operator import StreamingOperator
|
8 |
from .splitters import RandomSampler, Sampler
|
9 |
from .task import Task
|
|
|
10 |
|
11 |
|
12 |
class TaskCard(Artifact):
|
@@ -25,5 +25,7 @@ class TaskCard(Artifact):
|
|
25 |
loader: Loader
|
26 |
preprocess_steps: List[StreamingOperator] = None
|
27 |
task: Task
|
28 |
-
templates:
|
|
|
|
|
29 |
sampler: Sampler = OptionalField(default_factory=RandomSampler)
|
|
|
1 |
+
from typing import Dict, List, Union
|
2 |
|
3 |
from .artifact import Artifact
|
|
|
4 |
from .dataclass import OptionalField
|
5 |
from .loaders import Loader
|
6 |
from .operator import StreamingOperator
|
7 |
from .splitters import RandomSampler, Sampler
|
8 |
from .task import Task
|
9 |
+
from .templates import Template, TemplatesDict, TemplatesList
|
10 |
|
11 |
|
12 |
class TaskCard(Artifact):
|
|
|
25 |
loader: Loader
|
26 |
preprocess_steps: List[StreamingOperator] = None
|
27 |
task: Task
|
28 |
+
templates: Union[
|
29 |
+
TemplatesDict, TemplatesList, Dict[str, Template], List[Template]
|
30 |
+
] = None
|
31 |
sampler: Sampler = OptionalField(default_factory=RandomSampler)
|
collections.py
CHANGED
@@ -1,7 +1,8 @@
|
|
1 |
import random
|
2 |
import typing
|
|
|
3 |
from dataclasses import field
|
4 |
-
from typing import Dict, List
|
5 |
|
6 |
from .artifact import Artifact
|
7 |
from .dataclass import AbstractField
|
@@ -11,12 +12,16 @@ from .random_utils import new_random_generator
|
|
11 |
class Collection(Artifact):
|
12 |
items: typing.Collection = AbstractField()
|
13 |
|
14 |
-
def __getitem__(self, key):
|
15 |
try:
|
16 |
return self.items[key]
|
17 |
except LookupError as e:
|
18 |
raise LookupError(f"Cannot find item {key!r} in {self!r}") from e
|
19 |
|
|
|
|
|
|
|
|
|
20 |
|
21 |
class ListCollection(Collection):
|
22 |
items: List[Artifact] = field(default_factory=list)
|
@@ -33,10 +38,16 @@ class ListCollection(Collection):
|
|
33 |
def __add__(self, other):
|
34 |
return ListCollection(self.items.__add__(other.items))
|
35 |
|
|
|
|
|
|
|
36 |
|
37 |
class DictCollection(Collection):
|
38 |
items: Dict[str, Artifact] = field(default_factory=dict)
|
39 |
|
|
|
|
|
|
|
40 |
|
41 |
class ItemPicker(Artifact):
|
42 |
item: object = None
|
|
|
1 |
import random
|
2 |
import typing
|
3 |
+
from abc import abstractmethod
|
4 |
from dataclasses import field
|
5 |
+
from typing import Any, Dict, Hashable, List
|
6 |
|
7 |
from .artifact import Artifact
|
8 |
from .dataclass import AbstractField
|
|
|
12 |
class Collection(Artifact):
|
13 |
items: typing.Collection = AbstractField()
|
14 |
|
15 |
+
def __getitem__(self, key: Hashable) -> Any:
|
16 |
try:
|
17 |
return self.items[key]
|
18 |
except LookupError as e:
|
19 |
raise LookupError(f"Cannot find item {key!r} in {self!r}") from e
|
20 |
|
21 |
+
@abstractmethod
|
22 |
+
def keys(self) -> List[Hashable]:
|
23 |
+
pass
|
24 |
+
|
25 |
|
26 |
class ListCollection(Collection):
|
27 |
items: List[Artifact] = field(default_factory=list)
|
|
|
38 |
def __add__(self, other):
|
39 |
return ListCollection(self.items.__add__(other.items))
|
40 |
|
41 |
+
def keys(self) -> List[int]:
|
42 |
+
return list(range(len(self)))
|
43 |
+
|
44 |
|
45 |
class DictCollection(Collection):
|
46 |
items: Dict[str, Artifact] = field(default_factory=dict)
|
47 |
|
48 |
+
def keys(self) -> List[Hashable]:
|
49 |
+
return list(self.items.keys())
|
50 |
+
|
51 |
|
52 |
class ItemPicker(Artifact):
|
53 |
item: object = None
|
metrics.py
CHANGED
@@ -1310,6 +1310,59 @@ class Accuracy(InstanceMetric):
|
|
1310 |
return result
|
1311 |
|
1312 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1313 |
class JaccardIndex(InstanceMetric):
|
1314 |
reduction_map = {"mean": ["jaccard_index"]}
|
1315 |
main_score = "jaccard_index"
|
@@ -4741,7 +4794,7 @@ class F1Strings(InstanceMetric):
|
|
4741 |
main_score = "f1_strings"
|
4742 |
reduction_map = {"mean": ["f1_strings"]}
|
4743 |
prediction_type = str
|
4744 |
-
single_reference_per_prediction =
|
4745 |
_requirements_list = {
|
4746 |
"spacy": "Please pip install spacy",
|
4747 |
}
|
@@ -4764,7 +4817,7 @@ class F1Strings(InstanceMetric):
|
|
4764 |
prediction: str,
|
4765 |
task_data: List[Dict],
|
4766 |
) -> dict:
|
4767 |
-
doc_ref = self.nlp(references
|
4768 |
set_ref = Counter([token.text.lower() for token in doc_ref])
|
4769 |
doc_pred = self.nlp(prediction)
|
4770 |
set_pred = Counter([token.text.lower() for token in doc_pred])
|
|
|
1310 |
return result
|
1311 |
|
1312 |
|
1313 |
+
class ANLS(InstanceMetric):
|
1314 |
+
main_score = "anls"
|
1315 |
+
reduction_map = {"mean": ["anls"]}
|
1316 |
+
prediction_type = Any # string representation is compared
|
1317 |
+
|
1318 |
+
def compute(
|
1319 |
+
self,
|
1320 |
+
references: List[Any],
|
1321 |
+
prediction: Any,
|
1322 |
+
task_data: List[Dict],
|
1323 |
+
threshold=1.0,
|
1324 |
+
) -> dict:
|
1325 |
+
"""ANLS image-text accuracy metric."""
|
1326 |
+
values = []
|
1327 |
+
for answer in references:
|
1328 |
+
# preprocess both the answers - gt and prediction
|
1329 |
+
gt_answer = " ".join(answer.strip().lower().split())
|
1330 |
+
det_answer = " ".join(prediction.strip().lower().split())
|
1331 |
+
|
1332 |
+
# dist = levenshtein_distance(answer.lower(), detObject['answer'].lower())
|
1333 |
+
dist = self.levenshtein_distance(gt_answer, det_answer)
|
1334 |
+
length = max(len(answer.upper()), len(prediction.upper()))
|
1335 |
+
values.append(0.0 if length == 0 else float(dist) / float(length))
|
1336 |
+
|
1337 |
+
question_result = 1.0 - min(values)
|
1338 |
+
|
1339 |
+
if question_result < threshold:
|
1340 |
+
question_result = 0.0
|
1341 |
+
result = {}
|
1342 |
+
result["score"] = question_result
|
1343 |
+
result[self.main_score] = question_result
|
1344 |
+
result["score_name"] = self.main_score
|
1345 |
+
return result
|
1346 |
+
|
1347 |
+
@staticmethod
|
1348 |
+
def levenshtein_distance(s1, s2):
|
1349 |
+
if len(s1) > len(s2):
|
1350 |
+
s1, s2 = s2, s1
|
1351 |
+
|
1352 |
+
distances = range(len(s1) + 1)
|
1353 |
+
for i2, c2 in enumerate(s2):
|
1354 |
+
distances_ = [i2 + 1]
|
1355 |
+
for i1, c1 in enumerate(s1):
|
1356 |
+
if c1 == c2:
|
1357 |
+
distances_.append(distances[i1])
|
1358 |
+
else:
|
1359 |
+
distances_.append(
|
1360 |
+
1 + min((distances[i1], distances[i1 + 1], distances_[-1]))
|
1361 |
+
)
|
1362 |
+
distances = distances_
|
1363 |
+
return distances[-1]
|
1364 |
+
|
1365 |
+
|
1366 |
class JaccardIndex(InstanceMetric):
|
1367 |
reduction_map = {"mean": ["jaccard_index"]}
|
1368 |
main_score = "jaccard_index"
|
|
|
4794 |
main_score = "f1_strings"
|
4795 |
reduction_map = {"mean": ["f1_strings"]}
|
4796 |
prediction_type = str
|
4797 |
+
single_reference_per_prediction = False
|
4798 |
_requirements_list = {
|
4799 |
"spacy": "Please pip install spacy",
|
4800 |
}
|
|
|
4817 |
prediction: str,
|
4818 |
task_data: List[Dict],
|
4819 |
) -> dict:
|
4820 |
+
doc_ref = self.nlp(" ".join(references))
|
4821 |
set_ref = Counter([token.text.lower() for token in doc_ref])
|
4822 |
doc_pred = self.nlp(prediction)
|
4823 |
set_pred = Counter([token.text.lower() for token in doc_pred])
|
operators.py
CHANGED
@@ -1022,7 +1022,7 @@ class ArtifactFetcherMixin:
|
|
1022 |
if artifact_identifier not in cls.cache:
|
1023 |
artifact, artifactory = fetch_artifact(artifact_identifier)
|
1024 |
cls.cache[artifact_identifier] = artifact
|
1025 |
-
return cls.cache[artifact_identifier]
|
1026 |
|
1027 |
|
1028 |
class ApplyOperatorsField(InstanceOperator):
|
|
|
1022 |
if artifact_identifier not in cls.cache:
|
1023 |
artifact, artifactory = fetch_artifact(artifact_identifier)
|
1024 |
cls.cache[artifact_identifier] = artifact
|
1025 |
+
return copy.deepcopy(cls.cache[artifact_identifier])
|
1026 |
|
1027 |
|
1028 |
class ApplyOperatorsField(InstanceOperator):
|
settings_utils.py
CHANGED
@@ -180,7 +180,7 @@ if Constants.is_uninitilized():
|
|
180 |
constants.instance_stream = "__INSTANCE_STREAM__"
|
181 |
|
182 |
|
183 |
-
def get_settings():
|
184 |
return Settings()
|
185 |
|
186 |
|
|
|
180 |
constants.instance_stream = "__INSTANCE_STREAM__"
|
181 |
|
182 |
|
183 |
+
def get_settings() -> Settings:
|
184 |
return Settings()
|
185 |
|
186 |
|
templates.py
CHANGED
@@ -4,7 +4,7 @@ from random import random
|
|
4 |
from typing import Any, Dict, List, Optional, Tuple, Union
|
5 |
|
6 |
from .artifact import Artifact
|
7 |
-
from .collections import ListCollection
|
8 |
from .dataclass import NonPositionalField
|
9 |
from .dict_utils import dict_set
|
10 |
from .error_utils import Documentation, UnitxtError
|
@@ -866,7 +866,7 @@ class TemplatesList(ListCollection):
|
|
866 |
assert isinstance(template, Template)
|
867 |
|
868 |
|
869 |
-
class TemplatesDict(
|
870 |
def verify(self):
|
871 |
-
for
|
872 |
assert isinstance(template, Template)
|
|
|
4 |
from typing import Any, Dict, List, Optional, Tuple, Union
|
5 |
|
6 |
from .artifact import Artifact
|
7 |
+
from .collections import DictCollection, ListCollection
|
8 |
from .dataclass import NonPositionalField
|
9 |
from .dict_utils import dict_set
|
10 |
from .error_utils import Documentation, UnitxtError
|
|
|
866 |
assert isinstance(template, Template)
|
867 |
|
868 |
|
869 |
+
class TemplatesDict(DictCollection):
|
870 |
def verify(self):
|
871 |
+
for template in self.items.values():
|
872 |
assert isinstance(template, Template)
|
version.py
CHANGED
@@ -1 +1 @@
|
|
1 |
-
version = "1.13.
|
|
|
1 |
+
version = "1.13.1"
|