nielsbantilan commited on Aug 18, 2023

Commit

fccfca7

•

1 Parent(s): a371d23

Upload folder using huggingface_hub

Browse files

Files changed (21) hide show

.gitattributes +8 -0
flyte19klvulo/local_flytekit/cf21fa2a2d01e72e3915dc8f3c6e1b32/00000 +3 -0
flyte7htre9gj/local_flytekit/50076b0566bf12946e1eb1d4be15e838/00000 +3 -0
flytegwz7gead/local_flytekit/b377c904d20fc45d4ae0fe507ae85019/00000 +3 -0
flyteh3d_ydsb/local_flytekit/3656a2bdc0dbe9fdb5a43b6df6e8db08/00000 +3 -0
flytehp3ce2w5/local_flytekit/fbce013b721d69eb9b098cdac3d5c001/00000 +3 -0
flytemk5qri4f/local_flytekit/9836b038dd943755068a02464454db5a/00000 +3 -0
flytepxngzev1/local_flytekit/41e697b21bf163ce6a04ce166aca0549/00000 +3 -0
flytex31ghl6k/local_flytekit/92c9a08444022269ca292e8c396bdd23/00000 +3 -0
pytorch_model-00001-of-00003.bin +1 -1
pytorch_model-00002-of-00003.bin +1 -1
pytorch_model-00003-of-00003.bin +1 -1
tmp1irzl5w5/_remote_module_non_scriptable.py +81 -0
tmpcwd32mn0/_remote_module_non_scriptable.py +81 -0
tmpefjtsdm5/_remote_module_non_scriptable.py +81 -0
tmpftz082d8/__pycache__/_remote_module_non_scriptable.cpython-310.pyc +0 -0
tmpftz082d8/_remote_module_non_scriptable.py +81 -0
tmpj0u3x6ea/_remote_module_non_scriptable.py +81 -0
tmpnmkfim5e/_remote_module_non_scriptable.py +81 -0
trainer_state.json +108 -108
training_args.bin +1 -1

.gitattributes CHANGED Viewed

@@ -49,3 +49,11 @@ flyteraquk0cj/local_flytekit/776069c6405df68fd2755ce257e952ba/00000 filter=lfs d
 flyterpqo54fv/local_flytekit/fd49b76dd3b1ffbc62b1efcef00fd674/00000 filter=lfs diff=lfs merge=lfs -text
 flyteyao8jgm7/local_flytekit/67696dba0a579df645b5b2f987a9e4b9/00000 filter=lfs diff=lfs merge=lfs -text
 flyteyfv3rs04/local_flytekit/65aa521dee1e8da3c795348937da23ed/00000 filter=lfs diff=lfs merge=lfs -text

 flyterpqo54fv/local_flytekit/fd49b76dd3b1ffbc62b1efcef00fd674/00000 filter=lfs diff=lfs merge=lfs -text
 flyteyao8jgm7/local_flytekit/67696dba0a579df645b5b2f987a9e4b9/00000 filter=lfs diff=lfs merge=lfs -text
 flyteyfv3rs04/local_flytekit/65aa521dee1e8da3c795348937da23ed/00000 filter=lfs diff=lfs merge=lfs -text
+flyte19klvulo/local_flytekit/cf21fa2a2d01e72e3915dc8f3c6e1b32/00000 filter=lfs diff=lfs merge=lfs -text
+flyte7htre9gj/local_flytekit/50076b0566bf12946e1eb1d4be15e838/00000 filter=lfs diff=lfs merge=lfs -text
+flytegwz7gead/local_flytekit/b377c904d20fc45d4ae0fe507ae85019/00000 filter=lfs diff=lfs merge=lfs -text
+flyteh3d_ydsb/local_flytekit/3656a2bdc0dbe9fdb5a43b6df6e8db08/00000 filter=lfs diff=lfs merge=lfs -text
+flytehp3ce2w5/local_flytekit/fbce013b721d69eb9b098cdac3d5c001/00000 filter=lfs diff=lfs merge=lfs -text
+flytemk5qri4f/local_flytekit/9836b038dd943755068a02464454db5a/00000 filter=lfs diff=lfs merge=lfs -text
+flytepxngzev1/local_flytekit/41e697b21bf163ce6a04ce166aca0549/00000 filter=lfs diff=lfs merge=lfs -text
+flytex31ghl6k/local_flytekit/92c9a08444022269ca292e8c396bdd23/00000 filter=lfs diff=lfs merge=lfs -text

flyte19klvulo/local_flytekit/cf21fa2a2d01e72e3915dc8f3c6e1b32/00000 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:067772915d011157436dc1ea88cb38756555e25be2d07616d1ee97dfac6e6535
+size 133886409

flyte7htre9gj/local_flytekit/50076b0566bf12946e1eb1d4be15e838/00000 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:067772915d011157436dc1ea88cb38756555e25be2d07616d1ee97dfac6e6535
+size 133886409

flytegwz7gead/local_flytekit/b377c904d20fc45d4ae0fe507ae85019/00000 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:067772915d011157436dc1ea88cb38756555e25be2d07616d1ee97dfac6e6535
+size 133886409

flyteh3d_ydsb/local_flytekit/3656a2bdc0dbe9fdb5a43b6df6e8db08/00000 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:067772915d011157436dc1ea88cb38756555e25be2d07616d1ee97dfac6e6535
+size 133886409

flytehp3ce2w5/local_flytekit/fbce013b721d69eb9b098cdac3d5c001/00000 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:067772915d011157436dc1ea88cb38756555e25be2d07616d1ee97dfac6e6535
+size 133886409

flytemk5qri4f/local_flytekit/9836b038dd943755068a02464454db5a/00000 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:067772915d011157436dc1ea88cb38756555e25be2d07616d1ee97dfac6e6535
+size 133886409

flytepxngzev1/local_flytekit/41e697b21bf163ce6a04ce166aca0549/00000 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:067772915d011157436dc1ea88cb38756555e25be2d07616d1ee97dfac6e6535
+size 133886409

flytex31ghl6k/local_flytekit/92c9a08444022269ca292e8c396bdd23/00000 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:067772915d011157436dc1ea88cb38756555e25be2d07616d1ee97dfac6e6535
+size 133886409

pytorch_model-00001-of-00003.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:0edb9f1a102ad6501ee570b17824779e345dad58fa4c0fee69b413296923668b
 size 9877982386

 version https://git-lfs.github.com/spec/v1
+oid sha256:48c13c72d53fa11536598e333f28f673f7e03707ec8a1cc409d323c7c766973b
 size 9877982386

pytorch_model-00002-of-00003.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:e0210300fa233e838ff00964ab3e48ed9d867c21001e52949992e0ec55ed3cff
 size 9894793766

 version https://git-lfs.github.com/spec/v1
+oid sha256:0b5853196ea334e2c41c5c5ce5b6113886062a88420ef7c26fe49f1a00db3a66
 size 9894793766

pytorch_model-00003-of-00003.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:b9608bedcb9fd77131a6d81629f8caddaa607b616ac9440b5f1b515bb1a705db
 size 7180985861

 version https://git-lfs.github.com/spec/v1
+oid sha256:5adfd4a60606ecf168e9ca0effc61162f1c4c7eade22e01a3fcf8704eacbf578
 size 7180985861

tmp1irzl5w5/_remote_module_non_scriptable.py ADDED Viewed

	@@ -0,0 +1,81 @@

+from typing import *
+import torch
+import torch.distributed.rpc as rpc
+from torch import Tensor
+from torch._jit_internal import Future
+from torch.distributed.rpc import RRef
+from typing import Tuple  # pyre-ignore: unused import
+module_interface_cls = None
+def forward_async(self, *args, **kwargs):
+    args = (self.module_rref, self.device, self.is_device_map_set, *args)
+    kwargs = {**kwargs}
+    return rpc.rpc_async(
+        self.module_rref.owner(),
+        _remote_forward,
+        args,
+        kwargs,
+    )
+def forward(self, *args, **kwargs):
+    args = (self.module_rref, self.device, self.is_device_map_set, *args)
+    kwargs = {**kwargs}
+    ret_fut = rpc.rpc_async(
+        self.module_rref.owner(),
+        _remote_forward,
+        args,
+        kwargs,
+    )
+    return ret_fut.wait()
+_generated_methods = [
+    forward_async,
+    forward,
+]
+def _remote_forward(
+    module_rref: RRef[module_interface_cls], device: str, is_device_map_set: bool, *args, **kwargs):
+    module = module_rref.local_value()
+    device = torch.device(device)
+    if device.type != "cuda":
+        return module.forward(*args, **kwargs)
+    # If the module is on a cuda device,
+    # move any CPU tensor in args or kwargs to the same cuda device.
+    # Since torch script does not support generator expression,
+    # have to use concatenation instead of
+    # ``tuple(i.to(device) if isinstance(i, Tensor) else i for i in *args)``.
+    args = (*args,)
+    out_args: Tuple[()] = ()
+    for arg in args:
+        arg = (arg.to(device),) if isinstance(arg, Tensor) else (arg,)
+        out_args = out_args + arg
+    kwargs = {**kwargs}
+    for k, v in kwargs.items():
+        if isinstance(v, Tensor):
+            kwargs[k] = kwargs[k].to(device)
+    if is_device_map_set:
+        return module.forward(*out_args, **kwargs)
+    # If the device map is empty, then only CPU tensors are allowed to send over wire,
+    # so have to move any GPU tensor to CPU in the output.
+    # Since torch script does not support generator expression,
+    # have to use concatenation instead of
+    # ``tuple(i.cpu() if isinstance(i, Tensor) else i for i in module.forward(*out_args, **kwargs))``.
+    ret: Tuple[()] = ()
+    for i in module.forward(*out_args, **kwargs):
+        i = (i.cpu(),) if isinstance(i, Tensor) else (i,)
+        ret = ret + i
+    return ret

tmpcwd32mn0/_remote_module_non_scriptable.py ADDED Viewed

	@@ -0,0 +1,81 @@

+from typing import *
+import torch
+import torch.distributed.rpc as rpc
+from torch import Tensor
+from torch._jit_internal import Future
+from torch.distributed.rpc import RRef
+from typing import Tuple  # pyre-ignore: unused import
+module_interface_cls = None
+def forward_async(self, *args, **kwargs):
+    args = (self.module_rref, self.device, self.is_device_map_set, *args)
+    kwargs = {**kwargs}
+    return rpc.rpc_async(
+        self.module_rref.owner(),
+        _remote_forward,
+        args,
+        kwargs,
+    )
+def forward(self, *args, **kwargs):
+    args = (self.module_rref, self.device, self.is_device_map_set, *args)
+    kwargs = {**kwargs}
+    ret_fut = rpc.rpc_async(
+        self.module_rref.owner(),
+        _remote_forward,
+        args,
+        kwargs,
+    )
+    return ret_fut.wait()
+_generated_methods = [
+    forward_async,
+    forward,
+]
+def _remote_forward(
+    module_rref: RRef[module_interface_cls], device: str, is_device_map_set: bool, *args, **kwargs):
+    module = module_rref.local_value()
+    device = torch.device(device)
+    if device.type != "cuda":
+        return module.forward(*args, **kwargs)
+    # If the module is on a cuda device,
+    # move any CPU tensor in args or kwargs to the same cuda device.
+    # Since torch script does not support generator expression,
+    # have to use concatenation instead of
+    # ``tuple(i.to(device) if isinstance(i, Tensor) else i for i in *args)``.
+    args = (*args,)
+    out_args: Tuple[()] = ()
+    for arg in args:
+        arg = (arg.to(device),) if isinstance(arg, Tensor) else (arg,)
+        out_args = out_args + arg
+    kwargs = {**kwargs}
+    for k, v in kwargs.items():
+        if isinstance(v, Tensor):
+            kwargs[k] = kwargs[k].to(device)
+    if is_device_map_set:
+        return module.forward(*out_args, **kwargs)
+    # If the device map is empty, then only CPU tensors are allowed to send over wire,
+    # so have to move any GPU tensor to CPU in the output.
+    # Since torch script does not support generator expression,
+    # have to use concatenation instead of
+    # ``tuple(i.cpu() if isinstance(i, Tensor) else i for i in module.forward(*out_args, **kwargs))``.
+    ret: Tuple[()] = ()
+    for i in module.forward(*out_args, **kwargs):
+        i = (i.cpu(),) if isinstance(i, Tensor) else (i,)
+        ret = ret + i
+    return ret

tmpefjtsdm5/_remote_module_non_scriptable.py ADDED Viewed

	@@ -0,0 +1,81 @@

+from typing import *
+import torch
+import torch.distributed.rpc as rpc
+from torch import Tensor
+from torch._jit_internal import Future
+from torch.distributed.rpc import RRef
+from typing import Tuple  # pyre-ignore: unused import
+module_interface_cls = None
+def forward_async(self, *args, **kwargs):
+    args = (self.module_rref, self.device, self.is_device_map_set, *args)
+    kwargs = {**kwargs}
+    return rpc.rpc_async(
+        self.module_rref.owner(),
+        _remote_forward,
+        args,
+        kwargs,
+    )
+def forward(self, *args, **kwargs):
+    args = (self.module_rref, self.device, self.is_device_map_set, *args)
+    kwargs = {**kwargs}
+    ret_fut = rpc.rpc_async(
+        self.module_rref.owner(),
+        _remote_forward,
+        args,
+        kwargs,
+    )
+    return ret_fut.wait()
+_generated_methods = [
+    forward_async,
+    forward,
+]
+def _remote_forward(
+    module_rref: RRef[module_interface_cls], device: str, is_device_map_set: bool, *args, **kwargs):
+    module = module_rref.local_value()
+    device = torch.device(device)
+    if device.type != "cuda":
+        return module.forward(*args, **kwargs)
+    # If the module is on a cuda device,
+    # move any CPU tensor in args or kwargs to the same cuda device.
+    # Since torch script does not support generator expression,
+    # have to use concatenation instead of
+    # ``tuple(i.to(device) if isinstance(i, Tensor) else i for i in *args)``.
+    args = (*args,)
+    out_args: Tuple[()] = ()
+    for arg in args:
+        arg = (arg.to(device),) if isinstance(arg, Tensor) else (arg,)
+        out_args = out_args + arg
+    kwargs = {**kwargs}
+    for k, v in kwargs.items():
+        if isinstance(v, Tensor):
+            kwargs[k] = kwargs[k].to(device)
+    if is_device_map_set:
+        return module.forward(*out_args, **kwargs)
+    # If the device map is empty, then only CPU tensors are allowed to send over wire,
+    # so have to move any GPU tensor to CPU in the output.
+    # Since torch script does not support generator expression,
+    # have to use concatenation instead of
+    # ``tuple(i.cpu() if isinstance(i, Tensor) else i for i in module.forward(*out_args, **kwargs))``.
+    ret: Tuple[()] = ()
+    for i in module.forward(*out_args, **kwargs):
+        i = (i.cpu(),) if isinstance(i, Tensor) else (i,)
+        ret = ret + i
+    return ret

tmpftz082d8/__pycache__/_remote_module_non_scriptable.cpython-310.pyc ADDED Viewed

Binary file (1.5 kB). View file

tmpftz082d8/_remote_module_non_scriptable.py ADDED Viewed

	@@ -0,0 +1,81 @@

+from typing import *
+import torch
+import torch.distributed.rpc as rpc
+from torch import Tensor
+from torch._jit_internal import Future
+from torch.distributed.rpc import RRef
+from typing import Tuple  # pyre-ignore: unused import
+module_interface_cls = None
+def forward_async(self, *args, **kwargs):
+    args = (self.module_rref, self.device, self.is_device_map_set, *args)
+    kwargs = {**kwargs}
+    return rpc.rpc_async(
+        self.module_rref.owner(),
+        _remote_forward,
+        args,
+        kwargs,
+    )
+def forward(self, *args, **kwargs):
+    args = (self.module_rref, self.device, self.is_device_map_set, *args)
+    kwargs = {**kwargs}
+    ret_fut = rpc.rpc_async(
+        self.module_rref.owner(),
+        _remote_forward,
+        args,
+        kwargs,
+    )
+    return ret_fut.wait()
+_generated_methods = [
+    forward_async,
+    forward,
+]
+def _remote_forward(
+    module_rref: RRef[module_interface_cls], device: str, is_device_map_set: bool, *args, **kwargs):
+    module = module_rref.local_value()
+    device = torch.device(device)
+    if device.type != "cuda":
+        return module.forward(*args, **kwargs)
+    # If the module is on a cuda device,
+    # move any CPU tensor in args or kwargs to the same cuda device.
+    # Since torch script does not support generator expression,
+    # have to use concatenation instead of
+    # ``tuple(i.to(device) if isinstance(i, Tensor) else i for i in *args)``.
+    args = (*args,)
+    out_args: Tuple[()] = ()
+    for arg in args:
+        arg = (arg.to(device),) if isinstance(arg, Tensor) else (arg,)
+        out_args = out_args + arg
+    kwargs = {**kwargs}
+    for k, v in kwargs.items():
+        if isinstance(v, Tensor):
+            kwargs[k] = kwargs[k].to(device)
+    if is_device_map_set:
+        return module.forward(*out_args, **kwargs)
+    # If the device map is empty, then only CPU tensors are allowed to send over wire,
+    # so have to move any GPU tensor to CPU in the output.
+    # Since torch script does not support generator expression,
+    # have to use concatenation instead of
+    # ``tuple(i.cpu() if isinstance(i, Tensor) else i for i in module.forward(*out_args, **kwargs))``.
+    ret: Tuple[()] = ()
+    for i in module.forward(*out_args, **kwargs):
+        i = (i.cpu(),) if isinstance(i, Tensor) else (i,)
+        ret = ret + i
+    return ret

tmpj0u3x6ea/_remote_module_non_scriptable.py ADDED Viewed

	@@ -0,0 +1,81 @@

+from typing import *
+import torch
+import torch.distributed.rpc as rpc
+from torch import Tensor
+from torch._jit_internal import Future
+from torch.distributed.rpc import RRef
+from typing import Tuple  # pyre-ignore: unused import
+module_interface_cls = None
+def forward_async(self, *args, **kwargs):
+    args = (self.module_rref, self.device, self.is_device_map_set, *args)
+    kwargs = {**kwargs}
+    return rpc.rpc_async(
+        self.module_rref.owner(),
+        _remote_forward,
+        args,
+        kwargs,
+    )
+def forward(self, *args, **kwargs):
+    args = (self.module_rref, self.device, self.is_device_map_set, *args)
+    kwargs = {**kwargs}
+    ret_fut = rpc.rpc_async(
+        self.module_rref.owner(),
+        _remote_forward,
+        args,
+        kwargs,
+    )
+    return ret_fut.wait()
+_generated_methods = [
+    forward_async,
+    forward,
+]
+def _remote_forward(
+    module_rref: RRef[module_interface_cls], device: str, is_device_map_set: bool, *args, **kwargs):
+    module = module_rref.local_value()
+    device = torch.device(device)
+    if device.type != "cuda":
+        return module.forward(*args, **kwargs)
+    # If the module is on a cuda device,
+    # move any CPU tensor in args or kwargs to the same cuda device.
+    # Since torch script does not support generator expression,
+    # have to use concatenation instead of
+    # ``tuple(i.to(device) if isinstance(i, Tensor) else i for i in *args)``.
+    args = (*args,)
+    out_args: Tuple[()] = ()
+    for arg in args:
+        arg = (arg.to(device),) if isinstance(arg, Tensor) else (arg,)
+        out_args = out_args + arg
+    kwargs = {**kwargs}
+    for k, v in kwargs.items():
+        if isinstance(v, Tensor):
+            kwargs[k] = kwargs[k].to(device)
+    if is_device_map_set:
+        return module.forward(*out_args, **kwargs)
+    # If the device map is empty, then only CPU tensors are allowed to send over wire,
+    # so have to move any GPU tensor to CPU in the output.
+    # Since torch script does not support generator expression,
+    # have to use concatenation instead of
+    # ``tuple(i.cpu() if isinstance(i, Tensor) else i for i in module.forward(*out_args, **kwargs))``.
+    ret: Tuple[()] = ()
+    for i in module.forward(*out_args, **kwargs):
+        i = (i.cpu(),) if isinstance(i, Tensor) else (i,)
+        ret = ret + i
+    return ret

tmpnmkfim5e/_remote_module_non_scriptable.py ADDED Viewed

	@@ -0,0 +1,81 @@

+from typing import *
+import torch
+import torch.distributed.rpc as rpc
+from torch import Tensor
+from torch._jit_internal import Future
+from torch.distributed.rpc import RRef
+from typing import Tuple  # pyre-ignore: unused import
+module_interface_cls = None
+def forward_async(self, *args, **kwargs):
+    args = (self.module_rref, self.device, self.is_device_map_set, *args)
+    kwargs = {**kwargs}
+    return rpc.rpc_async(
+        self.module_rref.owner(),
+        _remote_forward,
+        args,
+        kwargs,
+    )
+def forward(self, *args, **kwargs):
+    args = (self.module_rref, self.device, self.is_device_map_set, *args)
+    kwargs = {**kwargs}
+    ret_fut = rpc.rpc_async(
+        self.module_rref.owner(),
+        _remote_forward,
+        args,
+        kwargs,
+    )
+    return ret_fut.wait()
+_generated_methods = [
+    forward_async,
+    forward,
+]
+def _remote_forward(
+    module_rref: RRef[module_interface_cls], device: str, is_device_map_set: bool, *args, **kwargs):
+    module = module_rref.local_value()
+    device = torch.device(device)
+    if device.type != "cuda":
+        return module.forward(*args, **kwargs)
+    # If the module is on a cuda device,
+    # move any CPU tensor in args or kwargs to the same cuda device.
+    # Since torch script does not support generator expression,
+    # have to use concatenation instead of
+    # ``tuple(i.to(device) if isinstance(i, Tensor) else i for i in *args)``.
+    args = (*args,)
+    out_args: Tuple[()] = ()
+    for arg in args:
+        arg = (arg.to(device),) if isinstance(arg, Tensor) else (arg,)
+        out_args = out_args + arg
+    kwargs = {**kwargs}
+    for k, v in kwargs.items():
+        if isinstance(v, Tensor):
+            kwargs[k] = kwargs[k].to(device)
+    if is_device_map_set:
+        return module.forward(*out_args, **kwargs)
+    # If the device map is empty, then only CPU tensors are allowed to send over wire,
+    # so have to move any GPU tensor to CPU in the output.
+    # Since torch script does not support generator expression,
+    # have to use concatenation instead of
+    # ``tuple(i.cpu() if isinstance(i, Tensor) else i for i in module.forward(*out_args, **kwargs))``.
+    ret: Tuple[()] = ()
+    for i in module.forward(*out_args, **kwargs):
+        i = (i.cpu(),) if isinstance(i, Tensor) else (i,)
+        ret = ret + i
+    return ret

trainer_state.json CHANGED Viewed

@@ -11,610 +11,610 @@
     {
       "epoch": 0.44,
       "learning_rate": 0,
-      "loss": 1.7341,
       "step": 1
     },
     {
       "epoch": 0.89,
       "learning_rate": 0,
-      "loss": 1.7223,
       "step": 2
     },
     {
       "epoch": 1.33,
       "learning_rate": 0,
-      "loss": 1.7608,
       "step": 3
     },
     {
       "epoch": 1.78,
       "learning_rate": 0,
-      "loss": 1.7115,
       "step": 4
     },
     {
       "epoch": 2.22,
       "learning_rate": 0,
-      "loss": 1.7181,
       "step": 5
     },
     {
       "epoch": 2.67,
       "learning_rate": 0,
-      "loss": 1.7022,
       "step": 6
     },
     {
       "epoch": 3.11,
       "learning_rate": 0,
-      "loss": 1.7242,
       "step": 7
     },
     {
       "epoch": 3.56,
       "learning_rate": 0,
-      "loss": 1.7352,
       "step": 8
     },
     {
       "epoch": 4.0,
       "learning_rate": 0,
-      "loss": 1.7181,
       "step": 9
     },
     {
       "epoch": 4.44,
       "learning_rate": 0,
-      "loss": 1.7213,
       "step": 10
     },
     {
       "epoch": 4.89,
       "learning_rate": 0,
-      "loss": 1.6694,
       "step": 11
     },
     {
       "epoch": 5.33,
       "learning_rate": 0,
-      "loss": 1.7046,
       "step": 12
     },
     {
       "epoch": 5.78,
       "learning_rate": 0,
-      "loss": 1.7109,
       "step": 13
     },
     {
       "epoch": 6.22,
       "learning_rate": 0,
-      "loss": 1.6948,
       "step": 14
     },
     {
       "epoch": 6.67,
       "learning_rate": 0,
-      "loss": 1.6816,
       "step": 15
     },
     {
       "epoch": 7.11,
-      "learning_rate": 0.0,
-      "loss": 1.6851,
       "step": 16
     },
     {
       "epoch": 7.56,
-      "learning_rate": 1.2618595071429148e-05,
-      "loss": 1.6041,
       "step": 17
     },
     {
       "epoch": 8.0,
-      "learning_rate": 2e-05,
-      "loss": 1.5208,
       "step": 18
     },
     {
       "epoch": 8.44,
       "learning_rate": 2e-05,
-      "loss": 1.4946,
       "step": 19
     },
     {
       "epoch": 8.89,
       "learning_rate": 2e-05,
-      "loss": 1.492,
       "step": 20
     },
     {
       "epoch": 9.33,
       "learning_rate": 2e-05,
-      "loss": 1.4501,
       "step": 21
     },
     {
       "epoch": 9.78,
       "learning_rate": 2e-05,
-      "loss": 1.1894,
       "step": 22
     },
     {
       "epoch": 10.22,
       "learning_rate": 2e-05,
-      "loss": 1.1437,
       "step": 23
     },
     {
       "epoch": 10.67,
       "learning_rate": 2e-05,
-      "loss": 1.02,
       "step": 24
     },
     {
       "epoch": 11.11,
       "learning_rate": 2e-05,
-      "loss": 0.926,
       "step": 25
     },
     {
       "epoch": 11.56,
       "learning_rate": 2e-05,
-      "loss": 0.7794,
       "step": 26
     },
     {
       "epoch": 12.0,
       "learning_rate": 2e-05,
-      "loss": 0.7719,
       "step": 27
     },
     {
       "epoch": 12.44,
       "learning_rate": 2e-05,
-      "loss": 0.6107,
       "step": 28
     },
     {
       "epoch": 12.89,
       "learning_rate": 2e-05,
-      "loss": 0.633,
       "step": 29
     },
     {
       "epoch": 13.33,
       "learning_rate": 2e-05,
-      "loss": 0.4781,
       "step": 30
     },
     {
       "epoch": 13.78,
       "learning_rate": 2e-05,
-      "loss": 0.4379,
       "step": 31
     },
     {
       "epoch": 14.22,
       "learning_rate": 2e-05,
-      "loss": 0.3391,
       "step": 32
     },
     {
       "epoch": 14.67,
       "learning_rate": 2e-05,
-      "loss": 0.2928,
       "step": 33
     },
     {
       "epoch": 15.11,
       "learning_rate": 2e-05,
-      "loss": 0.2631,
       "step": 34
     },
     {
       "epoch": 15.56,
       "learning_rate": 2e-05,
-      "loss": 0.2399,
       "step": 35
     },
     {
       "epoch": 16.0,
       "learning_rate": 2e-05,
-      "loss": 0.2075,
       "step": 36
     },
     {
       "epoch": 16.44,
       "learning_rate": 2e-05,
-      "loss": 0.186,
       "step": 37
     },
     {
       "epoch": 16.89,
       "learning_rate": 2e-05,
-      "loss": 0.1782,
       "step": 38
     },
     {
       "epoch": 17.33,
       "learning_rate": 2e-05,
-      "loss": 0.144,
       "step": 39
     },
     {
       "epoch": 17.78,
       "learning_rate": 2e-05,
-      "loss": 0.1317,
       "step": 40
     },
     {
       "epoch": 18.22,
       "learning_rate": 2e-05,
-      "loss": 0.1144,
       "step": 41
     },
     {
       "epoch": 18.67,
       "learning_rate": 2e-05,
-      "loss": 0.1193,
       "step": 42
     },
     {
       "epoch": 19.11,
       "learning_rate": 2e-05,
-      "loss": 0.1161,
       "step": 43
     },
     {
       "epoch": 19.56,
       "learning_rate": 2e-05,
-      "loss": 0.0993,
       "step": 44
     },
     {
       "epoch": 20.0,
       "learning_rate": 2e-05,
-      "loss": 0.1083,
       "step": 45
     },
     {
       "epoch": 20.44,
       "learning_rate": 2e-05,
-      "loss": 0.101,
       "step": 46
     },
     {
       "epoch": 20.89,
       "learning_rate": 2e-05,
-      "loss": 0.1013,
       "step": 47
     },
     {
       "epoch": 21.33,
       "learning_rate": 2e-05,
-      "loss": 0.1066,
       "step": 48
     },
     {
       "epoch": 21.78,
       "learning_rate": 2e-05,
-      "loss": 0.1005,
       "step": 49
     },
     {
       "epoch": 22.22,
       "learning_rate": 2e-05,
-      "loss": 0.0882,
       "step": 50
     },
     {
       "epoch": 22.67,
       "learning_rate": 2e-05,
-      "loss": 0.1067,
       "step": 51
     },
     {
       "epoch": 23.11,
       "learning_rate": 2e-05,
-      "loss": 0.0797,
       "step": 52
     },
     {
       "epoch": 23.56,
       "learning_rate": 2e-05,
-      "loss": 0.0943,
       "step": 53
     },
     {
       "epoch": 24.0,
       "learning_rate": 2e-05,
-      "loss": 0.0769,
       "step": 54
     },
     {
       "epoch": 24.44,
       "learning_rate": 2e-05,
-      "loss": 0.0855,
       "step": 55
     },
     {
       "epoch": 24.89,
       "learning_rate": 2e-05,
-      "loss": 0.0735,
       "step": 56
     },
     {
       "epoch": 25.33,
       "learning_rate": 2e-05,
-      "loss": 0.0833,
       "step": 57
     },
     {
       "epoch": 25.78,
       "learning_rate": 2e-05,
-      "loss": 0.0811,
       "step": 58
     },
     {
       "epoch": 26.22,
       "learning_rate": 2e-05,
-      "loss": 0.0772,
       "step": 59
     },
     {
       "epoch": 26.67,
       "learning_rate": 2e-05,
-      "loss": 0.0721,
       "step": 60
     },
     {
       "epoch": 27.11,
       "learning_rate": 2e-05,
-      "loss": 0.0825,
       "step": 61
     },
     {
       "epoch": 27.56,
       "learning_rate": 2e-05,
-      "loss": 0.0758,
       "step": 62
     },
     {
       "epoch": 28.0,
       "learning_rate": 2e-05,
-      "loss": 0.0725,
       "step": 63
     },
     {
       "epoch": 28.44,
       "learning_rate": 2e-05,
-      "loss": 0.077,
       "step": 64
     },
     {
       "epoch": 28.89,
       "learning_rate": 2e-05,
-      "loss": 0.0654,
       "step": 65
     },
     {
       "epoch": 29.33,
       "learning_rate": 2e-05,
-      "loss": 0.0675,
       "step": 66
     },
     {
       "epoch": 29.78,
       "learning_rate": 2e-05,
-      "loss": 0.0772,
       "step": 67
     },
     {
       "epoch": 30.22,
       "learning_rate": 2e-05,
-      "loss": 0.0718,
       "step": 68
     },
     {
       "epoch": 30.67,
       "learning_rate": 2e-05,
-      "loss": 0.0625,
       "step": 69
     },
     {
       "epoch": 31.11,
       "learning_rate": 2e-05,
-      "loss": 0.0616,
       "step": 70
     },
     {
       "epoch": 31.56,
       "learning_rate": 2e-05,
-      "loss": 0.071,
       "step": 71
     },
     {
       "epoch": 32.0,
       "learning_rate": 2e-05,
-      "loss": 0.0655,
       "step": 72
     },
     {
       "epoch": 32.44,
       "learning_rate": 2e-05,
-      "loss": 0.0591,
       "step": 73
     },
     {
       "epoch": 32.89,
       "learning_rate": 2e-05,
-      "loss": 0.0669,
       "step": 74
     },
     {
       "epoch": 33.33,
       "learning_rate": 2e-05,
-      "loss": 0.0653,
       "step": 75
     },
     {
       "epoch": 33.78,
       "learning_rate": 2e-05,
-      "loss": 0.0662,
       "step": 76
     },
     {
       "epoch": 34.22,
       "learning_rate": 2e-05,
-      "loss": 0.0688,
       "step": 77
     },
     {
       "epoch": 34.67,
       "learning_rate": 2e-05,
-      "loss": 0.0498,
       "step": 78
     },
     {
       "epoch": 35.11,
       "learning_rate": 2e-05,
-      "loss": 0.0576,
       "step": 79
     },
     {
       "epoch": 35.56,
       "learning_rate": 2e-05,
-      "loss": 0.0737,
       "step": 80
     },
     {
       "epoch": 36.0,
       "learning_rate": 2e-05,
-      "loss": 0.0609,
       "step": 81
     },
     {
       "epoch": 36.44,
       "learning_rate": 2e-05,
-      "loss": 0.0594,
       "step": 82
     },
     {
       "epoch": 36.89,
       "learning_rate": 2e-05,
-      "loss": 0.0725,
       "step": 83
     },
     {
       "epoch": 37.33,
       "learning_rate": 2e-05,
-      "loss": 0.0598,
       "step": 84
     },
     {
       "epoch": 37.78,
       "learning_rate": 2e-05,
-      "loss": 0.0652,
       "step": 85
     },
     {
       "epoch": 38.22,
       "learning_rate": 2e-05,
-      "loss": 0.0588,
       "step": 86
     },
     {
       "epoch": 38.67,
       "learning_rate": 2e-05,
-      "loss": 0.0671,
       "step": 87
     },
     {
       "epoch": 39.11,
       "learning_rate": 2e-05,
-      "loss": 0.0596,
       "step": 88
     },
     {
       "epoch": 39.56,
       "learning_rate": 2e-05,
-      "loss": 0.0518,
       "step": 89
     },
     {
       "epoch": 40.0,
       "learning_rate": 2e-05,
-      "loss": 0.0612,
       "step": 90
     },
     {
       "epoch": 40.44,
       "learning_rate": 2e-05,
-      "loss": 0.0593,
       "step": 91
     },
     {
       "epoch": 40.89,
       "learning_rate": 2e-05,
-      "loss": 0.0521,
       "step": 92
     },
     {
       "epoch": 41.33,
       "learning_rate": 2e-05,
-      "loss": 0.0536,
       "step": 93
     },
     {
       "epoch": 41.78,
       "learning_rate": 2e-05,
-      "loss": 0.0548,
       "step": 94
     },
     {
       "epoch": 42.22,
       "learning_rate": 2e-05,
-      "loss": 0.0507,
       "step": 95
     },
     {
       "epoch": 42.67,
       "learning_rate": 2e-05,
-      "loss": 0.0588,
       "step": 96
     },
     {
       "epoch": 43.11,
       "learning_rate": 2e-05,
-      "loss": 0.0506,
       "step": 97
     },
     {
       "epoch": 43.56,
       "learning_rate": 2e-05,
-      "loss": 0.055,
       "step": 98
     },
     {
       "epoch": 44.0,
       "learning_rate": 2e-05,
-      "loss": 0.0503,
       "step": 99
     },
     {
       "epoch": 44.44,
       "learning_rate": 2e-05,
-      "loss": 0.054,
       "step": 100
     },
     {
       "epoch": 44.44,
       "step": 100,
-      "total_flos": 7478779576320.0,
-      "train_loss": 0.49326207719743254,
-      "train_runtime": 9902.4306,
-      "train_samples_per_second": 0.969,
       "train_steps_per_second": 0.01
     }
   ],
@@ -622,7 +622,7 @@
   "max_steps": 100,
   "num_train_epochs": 50,
   "save_steps": 200,
-  "total_flos": 7478779576320.0,
   "trial_name": null,
   "trial_params": null
 }

     {
       "epoch": 0.44,
       "learning_rate": 0,
+      "loss": 1.6723,
       "step": 1
     },
     {
       "epoch": 0.89,
       "learning_rate": 0,
+      "loss": 1.7539,
       "step": 2
     },
     {
       "epoch": 1.33,
       "learning_rate": 0,
+      "loss": 1.7347,
       "step": 3
     },
     {
       "epoch": 1.78,
       "learning_rate": 0,
+      "loss": 1.709,
       "step": 4
     },
     {
       "epoch": 2.22,
       "learning_rate": 0,
+      "loss": 1.7275,
       "step": 5
     },
     {
       "epoch": 2.67,
       "learning_rate": 0,
+      "loss": 1.7085,
       "step": 6
     },
     {
       "epoch": 3.11,
       "learning_rate": 0,
+      "loss": 1.7304,
       "step": 7
     },
     {
       "epoch": 3.56,
       "learning_rate": 0,
+      "loss": 1.7121,
       "step": 8
     },
     {
       "epoch": 4.0,
       "learning_rate": 0,
+      "loss": 1.719,
       "step": 9
     },
     {
       "epoch": 4.44,
       "learning_rate": 0,
+      "loss": 1.7356,
       "step": 10
     },
     {
       "epoch": 4.89,
       "learning_rate": 0,
+      "loss": 1.7842,
       "step": 11
     },
     {
       "epoch": 5.33,
       "learning_rate": 0,
+      "loss": 1.7527,
       "step": 12
     },
     {
       "epoch": 5.78,
       "learning_rate": 0,
+      "loss": 1.6973,
       "step": 13
     },
     {
       "epoch": 6.22,
       "learning_rate": 0,
+      "loss": 1.7233,
       "step": 14
     },
     {
       "epoch": 6.67,
       "learning_rate": 0,
+      "loss": 1.7313,
       "step": 15
     },
     {
       "epoch": 7.11,
+      "learning_rate": 0,
+      "loss": 1.6788,
       "step": 16
     },
     {
       "epoch": 7.56,
+      "learning_rate": 0.0,
+      "loss": 1.7022,
       "step": 17
     },
     {
       "epoch": 8.0,
+      "learning_rate": 1.2618595071429148e-05,
+      "loss": 1.6138,
       "step": 18
     },
     {
       "epoch": 8.44,
       "learning_rate": 2e-05,
+      "loss": 1.5552,
       "step": 19
     },
     {
       "epoch": 8.89,
       "learning_rate": 2e-05,
+      "loss": 1.457,
       "step": 20
     },
     {
       "epoch": 9.33,
       "learning_rate": 2e-05,
+      "loss": 1.3525,
       "step": 21
     },
     {
       "epoch": 9.78,
       "learning_rate": 2e-05,
+      "loss": 1.249,
       "step": 22
     },
     {
       "epoch": 10.22,
       "learning_rate": 2e-05,
+      "loss": 1.148,
       "step": 23
     },
     {
       "epoch": 10.67,
       "learning_rate": 2e-05,
+      "loss": 0.9726,
       "step": 24
     },
     {
       "epoch": 11.11,
       "learning_rate": 2e-05,
+      "loss": 0.879,
       "step": 25
     },
     {
       "epoch": 11.56,
       "learning_rate": 2e-05,
+      "loss": 0.761,
       "step": 26
     },
     {
       "epoch": 12.0,
       "learning_rate": 2e-05,
+      "loss": 0.7408,
       "step": 27
     },
     {
       "epoch": 12.44,
       "learning_rate": 2e-05,
+      "loss": 0.6326,
       "step": 28
     },
     {
       "epoch": 12.89,
       "learning_rate": 2e-05,
+      "loss": 0.5798,
       "step": 29
     },
     {
       "epoch": 13.33,
       "learning_rate": 2e-05,
+      "loss": 0.5512,
       "step": 30
     },
     {
       "epoch": 13.78,
       "learning_rate": 2e-05,
+      "loss": 0.4236,
       "step": 31
     },
     {
       "epoch": 14.22,
       "learning_rate": 2e-05,
+      "loss": 0.3581,
       "step": 32
     },
     {
       "epoch": 14.67,
       "learning_rate": 2e-05,
+      "loss": 0.3329,
       "step": 33
     },
     {
       "epoch": 15.11,
       "learning_rate": 2e-05,
+      "loss": 0.2962,
       "step": 34
     },
     {
       "epoch": 15.56,
       "learning_rate": 2e-05,
+      "loss": 0.2572,
       "step": 35
     },
     {
       "epoch": 16.0,
       "learning_rate": 2e-05,
+      "loss": 0.2429,
       "step": 36
     },
     {
       "epoch": 16.44,
       "learning_rate": 2e-05,
+      "loss": 0.191,
       "step": 37
     },
     {
       "epoch": 16.89,
       "learning_rate": 2e-05,
+      "loss": 0.174,
       "step": 38
     },
     {
       "epoch": 17.33,
       "learning_rate": 2e-05,
+      "loss": 0.1721,
       "step": 39
     },
     {
       "epoch": 17.78,
       "learning_rate": 2e-05,
+      "loss": 0.1645,
       "step": 40
     },
     {
       "epoch": 18.22,
       "learning_rate": 2e-05,
+      "loss": 0.1313,
       "step": 41
     },
     {
       "epoch": 18.67,
       "learning_rate": 2e-05,
+      "loss": 0.1186,
       "step": 42
     },
     {
       "epoch": 19.11,
       "learning_rate": 2e-05,
+      "loss": 0.1309,
       "step": 43
     },
     {
       "epoch": 19.56,
       "learning_rate": 2e-05,
+      "loss": 0.1077,
       "step": 44
     },
     {
       "epoch": 20.0,
       "learning_rate": 2e-05,
+      "loss": 0.1156,
       "step": 45
     },
     {
       "epoch": 20.44,
       "learning_rate": 2e-05,
+      "loss": 0.1101,
       "step": 46
     },
     {
       "epoch": 20.89,
       "learning_rate": 2e-05,
+      "loss": 0.0979,
       "step": 47
     },
     {
       "epoch": 21.33,
       "learning_rate": 2e-05,
+      "loss": 0.101,
       "step": 48
     },
     {
       "epoch": 21.78,
       "learning_rate": 2e-05,
+      "loss": 0.1001,
       "step": 49
     },
     {
       "epoch": 22.22,
       "learning_rate": 2e-05,
+      "loss": 0.0894,
       "step": 50
     },
     {
       "epoch": 22.67,
       "learning_rate": 2e-05,
+      "loss": 0.0948,
       "step": 51
     },
     {
       "epoch": 23.11,
       "learning_rate": 2e-05,
+      "loss": 0.0861,
       "step": 52
     },
     {
       "epoch": 23.56,
       "learning_rate": 2e-05,
+      "loss": 0.0895,
       "step": 53
     },
     {
       "epoch": 24.0,
       "learning_rate": 2e-05,
+      "loss": 0.0918,
       "step": 54
     },
     {
       "epoch": 24.44,
       "learning_rate": 2e-05,
+      "loss": 0.0841,
       "step": 55
     },
     {
       "epoch": 24.89,
       "learning_rate": 2e-05,
+      "loss": 0.0756,
       "step": 56
     },
     {
       "epoch": 25.33,
       "learning_rate": 2e-05,
+      "loss": 0.0913,
       "step": 57
     },
     {
       "epoch": 25.78,
       "learning_rate": 2e-05,
+      "loss": 0.0796,
       "step": 58
     },
     {
       "epoch": 26.22,
       "learning_rate": 2e-05,
+      "loss": 0.0816,
       "step": 59
     },
     {
       "epoch": 26.67,
       "learning_rate": 2e-05,
+      "loss": 0.0728,
       "step": 60
     },
     {
       "epoch": 27.11,
       "learning_rate": 2e-05,
+      "loss": 0.0823,
       "step": 61
     },
     {
       "epoch": 27.56,
       "learning_rate": 2e-05,
+      "loss": 0.0798,
       "step": 62
     },
     {
       "epoch": 28.0,
       "learning_rate": 2e-05,
+      "loss": 0.0693,
       "step": 63
     },
     {
       "epoch": 28.44,
       "learning_rate": 2e-05,
+      "loss": 0.0805,
       "step": 64
     },
     {
       "epoch": 28.89,
       "learning_rate": 2e-05,
+      "loss": 0.0685,
       "step": 65
     },
     {
       "epoch": 29.33,
       "learning_rate": 2e-05,
+      "loss": 0.07,
       "step": 66
     },
     {
       "epoch": 29.78,
       "learning_rate": 2e-05,
+      "loss": 0.0779,
       "step": 67
     },
     {
       "epoch": 30.22,
       "learning_rate": 2e-05,
+      "loss": 0.0773,
       "step": 68
     },
     {
       "epoch": 30.67,
       "learning_rate": 2e-05,
+      "loss": 0.0631,
       "step": 69
     },
     {
       "epoch": 31.11,
       "learning_rate": 2e-05,
+      "loss": 0.0656,
       "step": 70
     },
     {
       "epoch": 31.56,
       "learning_rate": 2e-05,
+      "loss": 0.074,
       "step": 71
     },
     {
       "epoch": 32.0,
       "learning_rate": 2e-05,
+      "loss": 0.0651,
       "step": 72
     },
     {
       "epoch": 32.44,
       "learning_rate": 2e-05,
+      "loss": 0.0646,
       "step": 73
     },
     {
       "epoch": 32.89,
       "learning_rate": 2e-05,
+      "loss": 0.0699,
       "step": 74
     },
     {
       "epoch": 33.33,
       "learning_rate": 2e-05,
+      "loss": 0.0578,
       "step": 75
     },
     {
       "epoch": 33.78,
       "learning_rate": 2e-05,
+      "loss": 0.0763,
       "step": 76
     },
     {
       "epoch": 34.22,
       "learning_rate": 2e-05,
+      "loss": 0.0651,
       "step": 77
     },
     {
       "epoch": 34.67,
       "learning_rate": 2e-05,
+      "loss": 0.0565,
       "step": 78
     },
     {
       "epoch": 35.11,
       "learning_rate": 2e-05,
+      "loss": 0.0585,
       "step": 79
     },
     {
       "epoch": 35.56,
       "learning_rate": 2e-05,
+      "loss": 0.069,
       "step": 80
     },
     {
       "epoch": 36.0,
       "learning_rate": 2e-05,
+      "loss": 0.0571,
       "step": 81
     },
     {
       "epoch": 36.44,
       "learning_rate": 2e-05,
+      "loss": 0.0599,
       "step": 82
     },
     {
       "epoch": 36.89,
       "learning_rate": 2e-05,
+      "loss": 0.0639,
       "step": 83
     },
     {
       "epoch": 37.33,
       "learning_rate": 2e-05,
+      "loss": 0.0625,
       "step": 84
     },
     {
       "epoch": 37.78,
       "learning_rate": 2e-05,
+      "loss": 0.0631,
       "step": 85
     },
     {
       "epoch": 38.22,
       "learning_rate": 2e-05,
+      "loss": 0.0552,
       "step": 86
     },
     {
       "epoch": 38.67,
       "learning_rate": 2e-05,
+      "loss": 0.0681,
       "step": 87
     },
     {
       "epoch": 39.11,
       "learning_rate": 2e-05,
+      "loss": 0.0566,
       "step": 88
     },
     {
       "epoch": 39.56,
       "learning_rate": 2e-05,
+      "loss": 0.0594,
       "step": 89
     },
     {
       "epoch": 40.0,
       "learning_rate": 2e-05,
+      "loss": 0.0661,
       "step": 90
     },
     {
       "epoch": 40.44,
       "learning_rate": 2e-05,
+      "loss": 0.0632,
       "step": 91
     },
     {
       "epoch": 40.89,
       "learning_rate": 2e-05,
+      "loss": 0.0529,
       "step": 92
     },
     {
       "epoch": 41.33,
       "learning_rate": 2e-05,
+      "loss": 0.0574,
       "step": 93
     },
     {
       "epoch": 41.78,
       "learning_rate": 2e-05,
+      "loss": 0.055,
       "step": 94
     },
     {
       "epoch": 42.22,
       "learning_rate": 2e-05,
+      "loss": 0.0525,
       "step": 95
     },
     {
       "epoch": 42.67,
       "learning_rate": 2e-05,
+      "loss": 0.0625,
       "step": 96
     },
     {
       "epoch": 43.11,
       "learning_rate": 2e-05,
+      "loss": 0.0462,
       "step": 97
     },
     {
       "epoch": 43.56,
       "learning_rate": 2e-05,
+      "loss": 0.0615,
       "step": 98
     },
     {
       "epoch": 44.0,
       "learning_rate": 2e-05,
+      "loss": 0.0486,
       "step": 99
     },
     {
       "epoch": 44.44,
       "learning_rate": 2e-05,
+      "loss": 0.0539,
       "step": 100
     },
     {
       "epoch": 44.44,
       "step": 100,
+      "total_flos": 7439520890880.0,
+      "train_loss": 0.49884208038449285,
+      "train_runtime": 9870.4378,
+      "train_samples_per_second": 0.973,
       "train_steps_per_second": 0.01
     }
   ],
   "max_steps": 100,
   "num_train_epochs": 50,
   "save_steps": 200,
+  "total_flos": 7439520890880.0,
   "trial_name": null,
   "trial_params": null
 }

training_args.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:135858efd4811a09b43593532cb735b8e0bee8450cc74446fdba3f0ec24a504a
 size 6523

 version https://git-lfs.github.com/spec/v1
+oid sha256:d82a4c57a79a807d2d8164061827388f4a5ee4587c5aa26830fa7c388af4e898
 size 6523