Elron committed on
Commit
d389578
1 Parent(s): 7cdc7d0

Upload folder using huggingface_hub

Browse files
Files changed (9) hide show
  1. README.md +78 -2
  2. artifact.py +7 -0
  3. card.py +5 -3
  4. collections.py +13 -2
  5. metrics.py +55 -2
  6. operators.py +1 -1
  7. settings_utils.py +1 -1
  8. templates.py +3 -3
  9. version.py +1 -1
README.md CHANGED
@@ -57,10 +57,86 @@ Then launch the ui by running:
57
  unitxt-explore
58
  ```
59
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
  # 🦄 Contributors
61
 
62
  Please install Unitxt from source by:
63
- ```
64
  git clone git@github.com:IBM/unitxt.git
65
  cd unitxt
66
  pip install -e ".[dev]"
@@ -71,7 +147,7 @@ pre-commit install
71
 
72
  If you use Unitxt in your research, please cite our paper:
73
 
74
- ```
75
  @inproceedings{bandel-etal-2024-unitxt,
76
  title = "Unitxt: Flexible, Shareable and Reusable Data Preparation and Evaluation for Generative {AI}",
77
  author = "Bandel, Elron and
 
57
  unitxt-explore
58
  ```
59
 
60
+ # 🦄 Example
61
+
62
+ This is a simple example of running an end-to-end evaluation in self-contained Python code over user data.
63
+
64
+ See more examples in the examples subdirectory.
65
+
66
+ ```python
67
+ from unitxt import get_logger
68
+ from unitxt.api import evaluate, load_dataset
69
+ from unitxt.blocks import Task, TaskCard
70
+ from unitxt.inference import HFPipelineBasedInferenceEngine
71
+ from unitxt.loaders import LoadFromDictionary
72
+ from unitxt.templates import InputOutputTemplate, TemplatesDict
73
+ from unitxt.text_utils import print_dict
74
+
75
+ logger = get_logger()
76
+
77
+ # Set up question answer pairs in a dictionary
78
+ data = {
79
+ "test": [
80
+ {"question": "What is the capital of Texas?", "answer": "Austin"},
81
+ {"question": "What is the color of the sky?", "answer": "Blue"},
82
+ ]
83
+ }
84
+
85
+ card = TaskCard(
86
+ # Load the data from the dictionary. Data can also be loaded from HF, CSV files, COS and other sources using different loaders.
87
+ loader=LoadFromDictionary(data=data),
88
+ # Define the QA task input and output and metrics.
89
+ task=Task(
90
+ input_fields={"question": str},
91
+ reference_fields={"answer": str},
92
+ prediction_type=str,
93
+ metrics=["metrics.accuracy"],
94
+ ),
95
+ )
96
+
97
+ # Create a simple template that formats the input.
98
+ # Add lowercase normalization as a post processor on the model prediction.
99
+
100
+ template = InputOutputTemplate(
101
+ instruction="Answer the following question.",
102
+ input_format="{question}",
103
+ output_format="{answer}",
104
+ postprocessors=["processors.lower_case"],
105
+ )
106
+ # Verbalize the dataset using the template
107
+ dataset = load_dataset(card=card, template=template)
108
+ test_dataset = dataset["test"]
109
+
110
+
111
+ # Infer using flan t5 base using HF API
112
+ # can be replaced with any prediction code,
113
+ # including the built-in WMLInferenceEngine and OpenAiInferenceEngine.
114
+ model_name = "google/flan-t5-base"
115
+ inference_model = HFPipelineBasedInferenceEngine(
116
+ model_name=model_name, max_new_tokens=32
117
+ )
118
+ predictions = inference_model.infer(test_dataset)
119
+ evaluated_dataset = evaluate(predictions=predictions, data=test_dataset)
120
+
121
+ # Print results
122
+ for instance in evaluated_dataset:
123
+ print_dict(
124
+ instance,
125
+ keys_to_print=[
126
+ "source", # input to the model
127
+ "prediction", # model prediction
128
+ "processed_prediction", # model prediction after post processing
129
+ "references", # reference answer
130
+ "score", # scores (per instance and global)
131
+ ],
132
+ )
133
+
134
+ ```
135
+
136
  # 🦄 Contributors
137
 
138
  Please install Unitxt from source by:
139
+ ```bash
140
  git clone git@github.com:IBM/unitxt.git
141
  cd unitxt
142
  pip install -e ".[dev]"
 
147
 
148
  If you use Unitxt in your research, please cite our paper:
149
 
150
+ ```bib
151
  @inproceedings{bandel-etal-2024-unitxt,
152
  title = "Unitxt: Flexible, Shareable and Reusable Data Preparation and Evaluation for Generative {AI}",
153
  author = "Bandel, Elron and
artifact.py CHANGED
@@ -295,6 +295,13 @@ class Artifact(Dataclass):
295
  **self.process_data_before_dump(self._init_dict),
296
  }
297
 
 
 
 
 
 
 
 
298
  def process_data_before_dump(self, data):
299
  return data
300
 
 
295
  **self.process_data_before_dump(self._init_dict),
296
  }
297
 
298
+ def __deepcopy__(self, memo):
299
+ if id(self) in memo:
300
+ return memo[id(self)]
301
+ new_obj = Artifact.from_dict(self.to_dict())
302
+ memo[id(self)] = new_obj
303
+ return new_obj
304
+
305
  def process_data_before_dump(self, data):
306
  return data
307
 
card.py CHANGED
@@ -1,12 +1,12 @@
1
- from typing import List
2
 
3
  from .artifact import Artifact
4
- from .collections import Collection
5
  from .dataclass import OptionalField
6
  from .loaders import Loader
7
  from .operator import StreamingOperator
8
  from .splitters import RandomSampler, Sampler
9
  from .task import Task
 
10
 
11
 
12
  class TaskCard(Artifact):
@@ -25,5 +25,7 @@ class TaskCard(Artifact):
25
  loader: Loader
26
  preprocess_steps: List[StreamingOperator] = None
27
  task: Task
28
- templates: Collection = None
 
 
29
  sampler: Sampler = OptionalField(default_factory=RandomSampler)
 
1
+ from typing import Dict, List, Union
2
 
3
  from .artifact import Artifact
 
4
  from .dataclass import OptionalField
5
  from .loaders import Loader
6
  from .operator import StreamingOperator
7
  from .splitters import RandomSampler, Sampler
8
  from .task import Task
9
+ from .templates import Template, TemplatesDict, TemplatesList
10
 
11
 
12
  class TaskCard(Artifact):
 
25
  loader: Loader
26
  preprocess_steps: List[StreamingOperator] = None
27
  task: Task
28
+ templates: Union[
29
+ TemplatesDict, TemplatesList, Dict[str, Template], List[Template]
30
+ ] = None
31
  sampler: Sampler = OptionalField(default_factory=RandomSampler)
collections.py CHANGED
@@ -1,7 +1,8 @@
1
  import random
2
  import typing
 
3
  from dataclasses import field
4
- from typing import Dict, List
5
 
6
  from .artifact import Artifact
7
  from .dataclass import AbstractField
@@ -11,12 +12,16 @@ from .random_utils import new_random_generator
11
  class Collection(Artifact):
12
  items: typing.Collection = AbstractField()
13
 
14
- def __getitem__(self, key):
15
  try:
16
  return self.items[key]
17
  except LookupError as e:
18
  raise LookupError(f"Cannot find item {key!r} in {self!r}") from e
19
 
 
 
 
 
20
 
21
  class ListCollection(Collection):
22
  items: List[Artifact] = field(default_factory=list)
@@ -33,10 +38,16 @@ class ListCollection(Collection):
33
  def __add__(self, other):
34
  return ListCollection(self.items.__add__(other.items))
35
 
 
 
 
36
 
37
  class DictCollection(Collection):
38
  items: Dict[str, Artifact] = field(default_factory=dict)
39
 
 
 
 
40
 
41
  class ItemPicker(Artifact):
42
  item: object = None
 
1
  import random
2
  import typing
3
+ from abc import abstractmethod
4
  from dataclasses import field
5
+ from typing import Any, Dict, Hashable, List
6
 
7
  from .artifact import Artifact
8
  from .dataclass import AbstractField
 
12
  class Collection(Artifact):
13
  items: typing.Collection = AbstractField()
14
 
15
+ def __getitem__(self, key: Hashable) -> Any:
16
  try:
17
  return self.items[key]
18
  except LookupError as e:
19
  raise LookupError(f"Cannot find item {key!r} in {self!r}") from e
20
 
21
+ @abstractmethod
22
+ def keys(self) -> List[Hashable]:
23
+ pass
24
+
25
 
26
  class ListCollection(Collection):
27
  items: List[Artifact] = field(default_factory=list)
 
38
  def __add__(self, other):
39
  return ListCollection(self.items.__add__(other.items))
40
 
41
+ def keys(self) -> List[int]:
42
+ return list(range(len(self)))
43
+
44
 
45
  class DictCollection(Collection):
46
  items: Dict[str, Artifact] = field(default_factory=dict)
47
 
48
+ def keys(self) -> List[Hashable]:
49
+ return list(self.items.keys())
50
+
51
 
52
  class ItemPicker(Artifact):
53
  item: object = None
metrics.py CHANGED
@@ -1310,6 +1310,59 @@ class Accuracy(InstanceMetric):
1310
  return result
1311
 
1312
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1313
  class JaccardIndex(InstanceMetric):
1314
  reduction_map = {"mean": ["jaccard_index"]}
1315
  main_score = "jaccard_index"
@@ -4741,7 +4794,7 @@ class F1Strings(InstanceMetric):
4741
  main_score = "f1_strings"
4742
  reduction_map = {"mean": ["f1_strings"]}
4743
  prediction_type = str
4744
- single_reference_per_prediction = True
4745
  _requirements_list = {
4746
  "spacy": "Please pip install spacy",
4747
  }
@@ -4764,7 +4817,7 @@ class F1Strings(InstanceMetric):
4764
  prediction: str,
4765
  task_data: List[Dict],
4766
  ) -> dict:
4767
- doc_ref = self.nlp(references[0])
4768
  set_ref = Counter([token.text.lower() for token in doc_ref])
4769
  doc_pred = self.nlp(prediction)
4770
  set_pred = Counter([token.text.lower() for token in doc_pred])
 
1310
  return result
1311
 
1312
 
1313
+ class ANLS(InstanceMetric):
1314
+ main_score = "anls"
1315
+ reduction_map = {"mean": ["anls"]}
1316
+ prediction_type = Any # string representation is compared
1317
+
1318
+ def compute(
1319
+ self,
1320
+ references: List[Any],
1321
+ prediction: Any,
1322
+ task_data: List[Dict],
1323
+ threshold=1.0,
1324
+ ) -> dict:
1325
+ """ANLS image-text accuracy metric."""
1326
+ values = []
1327
+ for answer in references:
1328
+ # preprocess both the answers - gt and prediction
1329
+ gt_answer = " ".join(answer.strip().lower().split())
1330
+ det_answer = " ".join(prediction.strip().lower().split())
1331
+
1332
+ # dist = levenshtein_distance(answer.lower(), detObject['answer'].lower())
1333
+ dist = self.levenshtein_distance(gt_answer, det_answer)
1334
+ length = max(len(answer.upper()), len(prediction.upper()))
1335
+ values.append(0.0 if length == 0 else float(dist) / float(length))
1336
+
1337
+ question_result = 1.0 - min(values)
1338
+
1339
+ if question_result < threshold:
1340
+ question_result = 0.0
1341
+ result = {}
1342
+ result["score"] = question_result
1343
+ result[self.main_score] = question_result
1344
+ result["score_name"] = self.main_score
1345
+ return result
1346
+
1347
+ @staticmethod
1348
+ def levenshtein_distance(s1, s2):
1349
+ if len(s1) > len(s2):
1350
+ s1, s2 = s2, s1
1351
+
1352
+ distances = range(len(s1) + 1)
1353
+ for i2, c2 in enumerate(s2):
1354
+ distances_ = [i2 + 1]
1355
+ for i1, c1 in enumerate(s1):
1356
+ if c1 == c2:
1357
+ distances_.append(distances[i1])
1358
+ else:
1359
+ distances_.append(
1360
+ 1 + min((distances[i1], distances[i1 + 1], distances_[-1]))
1361
+ )
1362
+ distances = distances_
1363
+ return distances[-1]
1364
+
1365
+
1366
  class JaccardIndex(InstanceMetric):
1367
  reduction_map = {"mean": ["jaccard_index"]}
1368
  main_score = "jaccard_index"
 
4794
  main_score = "f1_strings"
4795
  reduction_map = {"mean": ["f1_strings"]}
4796
  prediction_type = str
4797
+ single_reference_per_prediction = False
4798
  _requirements_list = {
4799
  "spacy": "Please pip install spacy",
4800
  }
 
4817
  prediction: str,
4818
  task_data: List[Dict],
4819
  ) -> dict:
4820
+ doc_ref = self.nlp(" ".join(references))
4821
  set_ref = Counter([token.text.lower() for token in doc_ref])
4822
  doc_pred = self.nlp(prediction)
4823
  set_pred = Counter([token.text.lower() for token in doc_pred])
operators.py CHANGED
@@ -1022,7 +1022,7 @@ class ArtifactFetcherMixin:
1022
  if artifact_identifier not in cls.cache:
1023
  artifact, artifactory = fetch_artifact(artifact_identifier)
1024
  cls.cache[artifact_identifier] = artifact
1025
- return cls.cache[artifact_identifier]
1026
 
1027
 
1028
  class ApplyOperatorsField(InstanceOperator):
 
1022
  if artifact_identifier not in cls.cache:
1023
  artifact, artifactory = fetch_artifact(artifact_identifier)
1024
  cls.cache[artifact_identifier] = artifact
1025
+ return copy.deepcopy(cls.cache[artifact_identifier])
1026
 
1027
 
1028
  class ApplyOperatorsField(InstanceOperator):
settings_utils.py CHANGED
@@ -180,7 +180,7 @@ if Constants.is_uninitilized():
180
  constants.instance_stream = "__INSTANCE_STREAM__"
181
 
182
 
183
- def get_settings():
184
  return Settings()
185
 
186
 
 
180
  constants.instance_stream = "__INSTANCE_STREAM__"
181
 
182
 
183
+ def get_settings() -> Settings:
184
  return Settings()
185
 
186
 
templates.py CHANGED
@@ -4,7 +4,7 @@ from random import random
4
  from typing import Any, Dict, List, Optional, Tuple, Union
5
 
6
  from .artifact import Artifact
7
- from .collections import ListCollection
8
  from .dataclass import NonPositionalField
9
  from .dict_utils import dict_set
10
  from .error_utils import Documentation, UnitxtError
@@ -866,7 +866,7 @@ class TemplatesList(ListCollection):
866
  assert isinstance(template, Template)
867
 
868
 
869
- class TemplatesDict(Dict):
870
  def verify(self):
871
- for _key, template in self.items():
872
  assert isinstance(template, Template)
 
4
  from typing import Any, Dict, List, Optional, Tuple, Union
5
 
6
  from .artifact import Artifact
7
+ from .collections import DictCollection, ListCollection
8
  from .dataclass import NonPositionalField
9
  from .dict_utils import dict_set
10
  from .error_utils import Documentation, UnitxtError
 
866
  assert isinstance(template, Template)
867
 
868
 
869
+ class TemplatesDict(DictCollection):
870
  def verify(self):
871
+ for template in self.items.values():
872
  assert isinstance(template, Template)
version.py CHANGED
@@ -1 +1 @@
1
- version = "1.13.0"
 
1
+ version = "1.13.1"