Spaces:
Sleeping
Sleeping
christopher
committed on
Commit
•
3a67ca3
1
Parent(s):
1bc48d2
Initial commit
Browse files
- README.md +0 -2
- tokens_per_byte.py +7 -7
README.md
CHANGED
@@ -1,7 +1,5 @@
|
|
1 |
---
|
2 |
title: Tokens per Byte
|
3 |
-
datasets:
|
4 |
-
-
|
5 |
tags:
|
6 |
- evaluate
|
7 |
- measurement
|
|
|
1 |
---
|
2 |
title: Tokens per Byte
|
|
|
|
|
3 |
tags:
|
4 |
- evaluate
|
5 |
- measurement
|
tokens_per_byte.py
CHANGED
@@ -71,8 +71,7 @@ class TokensperByte(evaluate.Measurement):
|
|
71 |
inputs_description=_KWARGS_DESCRIPTION,
|
72 |
# This defines the format of each prediction and reference
|
73 |
features=datasets.Features({
|
74 |
-
'
|
75 |
-
'references': datasets.Value('int64'),
|
76 |
}),
|
77 |
# Homepage of the module for documentation
|
78 |
homepage="http://module.homepage",
|
@@ -86,10 +85,11 @@ class TokensperByte(evaluate.Measurement):
|
|
86 |
# TODO: Download external resources if needed
|
87 |
pass
|
88 |
|
89 |
-
def _compute(self,
|
90 |
"""Returns the scores"""
|
91 |
-
|
92 |
-
|
|
|
93 |
return {
|
94 |
-
"
|
95 |
-
}
|
|
|
71 |
inputs_description=_KWARGS_DESCRIPTION,
|
72 |
# This defines the format of each prediction and reference
|
73 |
features=datasets.Features({
|
74 |
+
'text':datasets.Value("string"),
|
|
|
75 |
}),
|
76 |
# Homepage of the module for documentation
|
77 |
homepage="http://module.homepage",
|
|
|
85 |
# TODO: Download external resources if needed
|
86 |
pass
|
87 |
|
88 |
+
def _compute(self, text, tokenizer):
|
89 |
"""Returns the scores"""
|
90 |
+
num_tokens = sum(tokenizer(text, return_length=True, return_attention_mask=False, add_special_tokens=False, return_token_type_ids=False)["length"])
|
91 |
+
num_bytes = sum([len(s.encode('utf-8')) for s in text])
|
92 |
+
|
93 |
return {
|
94 |
+
"tokens_per_byte": num_tokens / num_bytes,
|
95 |
+
}
|