Add files using upload-large-folder tool
This view is limited to 50 files because it contains too many changes. See raw diff.
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/aiosignal-1.3.2.dist-info/INSTALLER +1 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/aiosignal-1.3.2.dist-info/LICENSE +201 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/aiosignal-1.3.2.dist-info/METADATA +123 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/aiosignal-1.3.2.dist-info/RECORD +10 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/aiosignal-1.3.2.dist-info/WHEEL +6 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/aiosignal-1.3.2.dist-info/top_level.txt +1 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/attrs/__init__.py +69 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/attrs/__init__.pyi +263 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/attrs/converters.py +3 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/attrs/exceptions.py +3 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/attrs/filters.py +3 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/attrs/py.typed +0 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/attrs/setters.py +3 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/attrs/validators.py +3 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/datasets/__init__.py +79 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/datasets/arrow_writer.py +746 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/datasets/combine.py +215 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/datasets/data_files.py +825 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/datasets/fingerprint.py +494 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/datasets/hub.py +230 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/datasets/info.py +593 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/datasets/iterable_dataset.py +0 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/datasets/metric.py +652 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/datasets/search.py +785 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/datasets/splits.py +635 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/datasets/table.py +2422 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/frozenlist/__init__.py +98 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/frozenlist/__init__.pyi +47 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/frozenlist/_frozenlist.pyx +123 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/frozenlist/py.typed +1 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_models.py +516 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_ssl.py +9 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_utils.py +37 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/py.typed +0 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/jinja2-3.1.6.dist-info/INSTALLER +1 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/jinja2-3.1.6.dist-info/METADATA +84 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/jinja2-3.1.6.dist-info/RECORD +57 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/jinja2-3.1.6.dist-info/WHEEL +4 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/jinja2-3.1.6.dist-info/entry_points.txt +3 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml-6.0.2.dist-info/INSTALLER +1 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml-6.0.2.dist-info/METADATA +103 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml-6.0.2.dist-info/RECORD +204 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml-6.0.2.dist-info/WHEEL +6 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml-6.0.2.dist-info/top_level.txt +1 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia_curand_cu12-10.3.9.90.dist-info/INSTALLER +1 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia_curand_cu12-10.3.9.90.dist-info/License.txt +1568 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia_curand_cu12-10.3.9.90.dist-info/METADATA +44 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia_curand_cu12-10.3.9.90.dist-info/RECORD +32 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia_curand_cu12-10.3.9.90.dist-info/WHEEL +5 -0
- Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia_curand_cu12-10.3.9.90.dist-info/top_level.txt +1 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/aiosignal-1.3.2.dist-info/INSTALLER
ADDED
@@ -0,0 +1 @@
+pip
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/aiosignal-1.3.2.dist-info/LICENSE
ADDED
@@ -0,0 +1,201 @@
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "{}"
+      replaced with your own identifying information. (Don't include
+      the brackets!) The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright 2013-2019 Nikolay Kim and Andrew Svetlov
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/aiosignal-1.3.2.dist-info/METADATA
ADDED
@@ -0,0 +1,123 @@
+Metadata-Version: 2.1
+Name: aiosignal
+Version: 1.3.2
+Summary: aiosignal: a list of registered asynchronous callbacks
+Home-page: https://github.com/aio-libs/aiosignal
+Maintainer: aiohttp team <team@aiohttp.org>
+Maintainer-email: team@aiohttp.org
+License: Apache 2.0
+Project-URL: Chat: Gitter, https://gitter.im/aio-libs/Lobby
+Project-URL: CI: GitHub Actions, https://github.com/aio-libs/aiosignal/actions
+Project-URL: Coverage: codecov, https://codecov.io/github/aio-libs/aiosignal
+Project-URL: Docs: RTD, https://docs.aiosignal.org
+Project-URL: GitHub: issues, https://github.com/aio-libs/aiosignal/issues
+Project-URL: GitHub: repo, https://github.com/aio-libs/aiosignal
+Classifier: License :: OSI Approved :: Apache Software License
+Classifier: Intended Audience :: Developers
+Classifier: Programming Language :: Python
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3 :: Only
+Classifier: Development Status :: 5 - Production/Stable
+Classifier: Operating System :: POSIX
+Classifier: Operating System :: MacOS :: MacOS X
+Classifier: Operating System :: Microsoft :: Windows
+Classifier: Framework :: AsyncIO
+Requires-Python: >=3.9
+Description-Content-Type: text/x-rst
+License-File: LICENSE
+Requires-Dist: frozenlist>=1.1.0
+
+=========
+aiosignal
+=========
+
+.. image:: https://github.com/aio-libs/aiosignal/workflows/CI/badge.svg
+   :target: https://github.com/aio-libs/aiosignal/actions?query=workflow%3ACI
+   :alt: GitHub status for master branch
+
+.. image:: https://codecov.io/gh/aio-libs/aiosignal/branch/master/graph/badge.svg
+   :target: https://codecov.io/gh/aio-libs/aiosignal
+   :alt: codecov.io status for master branch
+
+.. image:: https://badge.fury.io/py/aiosignal.svg
+   :target: https://pypi.org/project/aiosignal
+   :alt: Latest PyPI package version
+
+.. image:: https://readthedocs.org/projects/aiosignal/badge/?version=latest
+   :target: https://aiosignal.readthedocs.io/
+   :alt: Latest Read The Docs
+
+.. image:: https://img.shields.io/discourse/topics?server=https%3A%2F%2Faio-libs.discourse.group%2F
+   :target: https://aio-libs.discourse.group/
+   :alt: Discourse group for io-libs
+
+.. image:: https://badges.gitter.im/Join%20Chat.svg
+   :target: https://gitter.im/aio-libs/Lobby
+   :alt: Chat on Gitter
+
+Introduction
+============
+
+A project to manage callbacks in `asyncio` projects.
+
+``Signal`` is a list of registered asynchronous callbacks.
+
+The signal's life-cycle has two stages: after creation its content
+could be filled by using standard list operations: ``sig.append()``
+etc.
+
+After you call ``sig.freeze()`` the signal is *frozen*: adding, removing
+and dropping callbacks is forbidden.
+
+The only available operation is calling the previously registered
+callbacks by using ``await sig.send(data)``.
+
+For concrete usage examples see the `Signals
+<https://docs.aiohttp.org/en/stable/web_advanced.html#aiohttp-web-signals>
+section of the `Web Server Advanced
+<https://docs.aiohttp.org/en/stable/web_advanced.html>` chapter of the `aiohttp
+documentation`_.
+
+
+Installation
+------------
+
+::
+
+   $ pip install aiosignal
+
+The library requires Python 3.8 or newer.
+
+
+Documentation
+=============
+
+https://aiosignal.readthedocs.io/
+
+Communication channels
+======================
+
+*gitter chat* https://gitter.im/aio-libs/Lobby
+
+Requirements
+============
+
+- Python >= 3.8
+- frozenlist >= 1.0.0
+
+License
+=======
+
+``aiosignal`` is offered under the Apache 2 license.
+
+Source code
+===========
+
+The project is hosted on GitHub_
+
+Please file an issue in the `bug tracker
+<https://github.com/aio-libs/aiosignal/issues>`_ if you have found a bug
+or have some suggestions to improve the library.
+
+.. _GitHub: https://github.com/aio-libs/aiosignal
+.. _aiohttp documentation: https://docs.aiohttp.org/
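Note: the two-stage life-cycle described in the METADATA above (list stage, then freeze, then send) takes very little code to exercise. A minimal runnable sketch using only the public aiosignal API; the owner string and callback name are illustrative, not from the package:

    import asyncio
    from aiosignal import Signal

    async def on_event(data):
        print("received:", data)

    async def main():
        sig = Signal(owner="demo")   # owner is only used in the Signal's repr
        sig.append(on_event)         # first stage: plain list operations
        sig.freeze()                 # second stage: registration is locked
        await sig.send("hello")      # awaits each registered callback in order

    asyncio.run(main())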
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/aiosignal-1.3.2.dist-info/RECORD
ADDED
@@ -0,0 +1,10 @@
+aiosignal-1.3.2.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
+aiosignal-1.3.2.dist-info/LICENSE,sha256=b9UkPpLdf5jsacesN3co50kFcJ_1J6W_mNbQJjwE9bY,11332
+aiosignal-1.3.2.dist-info/METADATA,sha256=TeI_xgZ191qgx37rviEnpMWC0QnYsg_j9EGVivNqqjc,3753
+aiosignal-1.3.2.dist-info/RECORD,,
+aiosignal-1.3.2.dist-info/WHEEL,sha256=pxeNX5JdtCe58PUSYP9upmc7jdRPgvT0Gm9kb1SHlVw,109
+aiosignal-1.3.2.dist-info/top_level.txt,sha256=z45aNOKGDdrI1roqZY3BGXQ22kJFPHBmVdwtLYLtXC0,10
+aiosignal/__init__.py,sha256=1oIrRl6kNpqFh32e7HfMFbMV_35v8sqJJFfnuKgmtEU,867
+aiosignal/__init__.pyi,sha256=xeCddYSS8fZAkz8S4HuKSR2IDe3N7RW_LKcXDPPA1Xk,311
+aiosignal/__pycache__/__init__.cpython-312.pyc,,
+aiosignal/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/aiosignal-1.3.2.dist-info/WHEEL
ADDED
@@ -0,0 +1,6 @@
+Wheel-Version: 1.0
+Generator: setuptools (75.6.0)
+Root-Is-Purelib: true
+Tag: py2-none-any
+Tag: py3-none-any
+
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/aiosignal-1.3.2.dist-info/top_level.txt
ADDED
@@ -0,0 +1 @@
+aiosignal
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/attrs/__init__.py
ADDED
@@ -0,0 +1,69 @@
+# SPDX-License-Identifier: MIT
+
+from attr import (
+    NOTHING,
+    Attribute,
+    AttrsInstance,
+    Converter,
+    Factory,
+    NothingType,
+    _make_getattr,
+    assoc,
+    cmp_using,
+    define,
+    evolve,
+    field,
+    fields,
+    fields_dict,
+    frozen,
+    has,
+    make_class,
+    mutable,
+    resolve_types,
+    validate,
+)
+from attr._next_gen import asdict, astuple
+
+from . import converters, exceptions, filters, setters, validators
+
+
+__all__ = [
+    "NOTHING",
+    "Attribute",
+    "AttrsInstance",
+    "Converter",
+    "Factory",
+    "NothingType",
+    "__author__",
+    "__copyright__",
+    "__description__",
+    "__doc__",
+    "__email__",
+    "__license__",
+    "__title__",
+    "__url__",
+    "__version__",
+    "__version_info__",
+    "asdict",
+    "assoc",
+    "astuple",
+    "cmp_using",
+    "converters",
+    "define",
+    "evolve",
+    "exceptions",
+    "field",
+    "fields",
+    "fields_dict",
+    "filters",
+    "frozen",
+    "has",
+    "make_class",
+    "mutable",
+    "resolve_types",
+    "setters",
+    "validate",
+    "validators",
+]
+
+__getattr__ = _make_getattr(__name__)
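Note: this attrs/__init__.py simply re-exports the modern API (define, field, and the converters/validators submodules) from the underlying attr package. A minimal sketch of that API in use, relying only on the names exported above; the Point class is illustrative:

    from attrs import define, field, validators

    @define  # generates __init__, __repr__, __eq__, and slots
    class Point:
        x: int
        y: int = field(default=0, validator=validators.instance_of(int))

    assert Point(1) == Point(1, 0)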
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/attrs/__init__.pyi
ADDED
@@ -0,0 +1,263 @@
+import sys
+
+from typing import (
+    Any,
+    Callable,
+    Mapping,
+    Sequence,
+    overload,
+    TypeVar,
+)
+
+# Because we need to type our own stuff, we have to make everything from
+# attr explicitly public too.
+from attr import __author__ as __author__
+from attr import __copyright__ as __copyright__
+from attr import __description__ as __description__
+from attr import __email__ as __email__
+from attr import __license__ as __license__
+from attr import __title__ as __title__
+from attr import __url__ as __url__
+from attr import __version__ as __version__
+from attr import __version_info__ as __version_info__
+from attr import assoc as assoc
+from attr import Attribute as Attribute
+from attr import AttrsInstance as AttrsInstance
+from attr import cmp_using as cmp_using
+from attr import converters as converters
+from attr import Converter as Converter
+from attr import evolve as evolve
+from attr import exceptions as exceptions
+from attr import Factory as Factory
+from attr import fields as fields
+from attr import fields_dict as fields_dict
+from attr import filters as filters
+from attr import has as has
+from attr import make_class as make_class
+from attr import NOTHING as NOTHING
+from attr import resolve_types as resolve_types
+from attr import setters as setters
+from attr import validate as validate
+from attr import validators as validators
+from attr import attrib, asdict as asdict, astuple as astuple
+from attr import NothingType as NothingType
+
+if sys.version_info >= (3, 11):
+    from typing import dataclass_transform
+else:
+    from typing_extensions import dataclass_transform
+
+_T = TypeVar("_T")
+_C = TypeVar("_C", bound=type)
+
+_EqOrderType = bool | Callable[[Any], Any]
+_ValidatorType = Callable[[Any, "Attribute[_T]", _T], Any]
+_CallableConverterType = Callable[[Any], Any]
+_ConverterType = _CallableConverterType | Converter[Any, Any]
+_ReprType = Callable[[Any], str]
+_ReprArgType = bool | _ReprType
+_OnSetAttrType = Callable[[Any, "Attribute[Any]", Any], Any]
+_OnSetAttrArgType = _OnSetAttrType | list[_OnSetAttrType] | setters._NoOpType
+_FieldTransformer = Callable[
+    [type, list["Attribute[Any]"]], list["Attribute[Any]"]
+]
+# FIXME: in reality, if multiple validators are passed they must be in a list
+# or tuple, but those are invariant and so would prevent subtypes of
+# _ValidatorType from working when passed in a list or tuple.
+_ValidatorArgType = _ValidatorType[_T] | Sequence[_ValidatorType[_T]]
+
+@overload
+def field(
+    *,
+    default: None = ...,
+    validator: None = ...,
+    repr: _ReprArgType = ...,
+    hash: bool | None = ...,
+    init: bool = ...,
+    metadata: Mapping[Any, Any] | None = ...,
+    converter: None = ...,
+    factory: None = ...,
+    kw_only: bool = ...,
+    eq: bool | None = ...,
+    order: bool | None = ...,
+    on_setattr: _OnSetAttrArgType | None = ...,
+    alias: str | None = ...,
+    type: type | None = ...,
+) -> Any: ...
+
+# This form catches an explicit None or no default and infers the type from the
+# other arguments.
+@overload
+def field(
+    *,
+    default: None = ...,
+    validator: _ValidatorArgType[_T] | None = ...,
+    repr: _ReprArgType = ...,
+    hash: bool | None = ...,
+    init: bool = ...,
+    metadata: Mapping[Any, Any] | None = ...,
+    converter: _ConverterType
+    | list[_ConverterType]
+    | tuple[_ConverterType]
+    | None = ...,
+    factory: Callable[[], _T] | None = ...,
+    kw_only: bool = ...,
+    eq: _EqOrderType | None = ...,
+    order: _EqOrderType | None = ...,
+    on_setattr: _OnSetAttrArgType | None = ...,
+    alias: str | None = ...,
+    type: type | None = ...,
+) -> _T: ...
+
+# This form catches an explicit default argument.
+@overload
+def field(
+    *,
+    default: _T,
+    validator: _ValidatorArgType[_T] | None = ...,
+    repr: _ReprArgType = ...,
+    hash: bool | None = ...,
+    init: bool = ...,
+    metadata: Mapping[Any, Any] | None = ...,
+    converter: _ConverterType
+    | list[_ConverterType]
+    | tuple[_ConverterType]
+    | None = ...,
+    factory: Callable[[], _T] | None = ...,
+    kw_only: bool = ...,
+    eq: _EqOrderType | None = ...,
+    order: _EqOrderType | None = ...,
+    on_setattr: _OnSetAttrArgType | None = ...,
+    alias: str | None = ...,
+    type: type | None = ...,
+) -> _T: ...
+
+# This form covers type=non-Type: e.g. forward references (str), Any
+@overload
+def field(
+    *,
+    default: _T | None = ...,
+    validator: _ValidatorArgType[_T] | None = ...,
+    repr: _ReprArgType = ...,
+    hash: bool | None = ...,
+    init: bool = ...,
+    metadata: Mapping[Any, Any] | None = ...,
+    converter: _ConverterType
+    | list[_ConverterType]
+    | tuple[_ConverterType]
+    | None = ...,
+    factory: Callable[[], _T] | None = ...,
+    kw_only: bool = ...,
+    eq: _EqOrderType | None = ...,
+    order: _EqOrderType | None = ...,
+    on_setattr: _OnSetAttrArgType | None = ...,
+    alias: str | None = ...,
+    type: type | None = ...,
+) -> Any: ...
+@overload
+@dataclass_transform(field_specifiers=(attrib, field))
+def define(
+    maybe_cls: _C,
+    *,
+    these: dict[str, Any] | None = ...,
+    repr: bool = ...,
+    unsafe_hash: bool | None = ...,
+    hash: bool | None = ...,
+    init: bool = ...,
+    slots: bool = ...,
+    frozen: bool = ...,
+    weakref_slot: bool = ...,
+    str: bool = ...,
+    auto_attribs: bool = ...,
+    kw_only: bool = ...,
+    cache_hash: bool = ...,
+    auto_exc: bool = ...,
+    eq: bool | None = ...,
+    order: bool | None = ...,
+    auto_detect: bool = ...,
+    getstate_setstate: bool | None = ...,
+    on_setattr: _OnSetAttrArgType | None = ...,
+    field_transformer: _FieldTransformer | None = ...,
+    match_args: bool = ...,
+) -> _C: ...
+@overload
+@dataclass_transform(field_specifiers=(attrib, field))
+def define(
+    maybe_cls: None = ...,
+    *,
+    these: dict[str, Any] | None = ...,
+    repr: bool = ...,
+    unsafe_hash: bool | None = ...,
+    hash: bool | None = ...,
+    init: bool = ...,
+    slots: bool = ...,
+    frozen: bool = ...,
+    weakref_slot: bool = ...,
+    str: bool = ...,
+    auto_attribs: bool = ...,
+    kw_only: bool = ...,
+    cache_hash: bool = ...,
+    auto_exc: bool = ...,
+    eq: bool | None = ...,
+    order: bool | None = ...,
+    auto_detect: bool = ...,
+    getstate_setstate: bool | None = ...,
+    on_setattr: _OnSetAttrArgType | None = ...,
+    field_transformer: _FieldTransformer | None = ...,
+    match_args: bool = ...,
+) -> Callable[[_C], _C]: ...
+
+mutable = define
+
+@overload
+@dataclass_transform(frozen_default=True, field_specifiers=(attrib, field))
+def frozen(
+    maybe_cls: _C,
+    *,
+    these: dict[str, Any] | None = ...,
+    repr: bool = ...,
+    unsafe_hash: bool | None = ...,
+    hash: bool | None = ...,
+    init: bool = ...,
+    slots: bool = ...,
+    frozen: bool = ...,
+    weakref_slot: bool = ...,
+    str: bool = ...,
+    auto_attribs: bool = ...,
+    kw_only: bool = ...,
+    cache_hash: bool = ...,
+    auto_exc: bool = ...,
+    eq: bool | None = ...,
+    order: bool | None = ...,
+    auto_detect: bool = ...,
+    getstate_setstate: bool | None = ...,
+    on_setattr: _OnSetAttrArgType | None = ...,
+    field_transformer: _FieldTransformer | None = ...,
+    match_args: bool = ...,
+) -> _C: ...
+@overload
+@dataclass_transform(frozen_default=True, field_specifiers=(attrib, field))
+def frozen(
+    maybe_cls: None = ...,
+    *,
+    these: dict[str, Any] | None = ...,
+    repr: bool = ...,
+    unsafe_hash: bool | None = ...,
+    hash: bool | None = ...,
+    init: bool = ...,
+    slots: bool = ...,
+    frozen: bool = ...,
+    weakref_slot: bool = ...,
+    str: bool = ...,
+    auto_attribs: bool = ...,
+    kw_only: bool = ...,
+    cache_hash: bool = ...,
+    auto_exc: bool = ...,
+    eq: bool | None = ...,
+    order: bool | None = ...,
+    auto_detect: bool = ...,
+    getstate_setstate: bool | None = ...,
+    on_setattr: _OnSetAttrArgType | None = ...,
+    field_transformer: _FieldTransformer | None = ...,
+    match_args: bool = ...,
+) -> Callable[[_C], _C]: ...
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/attrs/converters.py
ADDED
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: MIT
+
+from attr.converters import *  # noqa: F403
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/attrs/exceptions.py
ADDED
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: MIT
+
+from attr.exceptions import *  # noqa: F403
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/attrs/filters.py
ADDED
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: MIT
+
+from attr.filters import *  # noqa: F403
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/attrs/py.typed
ADDED
File without changes
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/attrs/setters.py
ADDED
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: MIT
+
+from attr.setters import *  # noqa: F403
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/attrs/validators.py
ADDED
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: MIT
+
+from attr.validators import *  # noqa: F403
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/datasets/__init__.py
ADDED
@@ -0,0 +1,79 @@
+# Copyright 2020 The HuggingFace Datasets Authors and the TensorFlow Datasets Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+__version__ = "2.20.0"
+
+from .arrow_dataset import Dataset
+from .arrow_reader import ReadInstruction
+from .builder import ArrowBasedBuilder, BeamBasedBuilder, BuilderConfig, DatasetBuilder, GeneratorBasedBuilder
+from .combine import concatenate_datasets, interleave_datasets
+from .dataset_dict import DatasetDict, IterableDatasetDict
+from .download import *
+from .features import *
+from .fingerprint import disable_caching, enable_caching, is_caching_enabled, set_caching_enabled
+from .info import DatasetInfo, MetricInfo
+from .inspect import (
+    get_dataset_config_info,
+    get_dataset_config_names,
+    get_dataset_default_config_name,
+    get_dataset_infos,
+    get_dataset_split_names,
+    inspect_dataset,
+    inspect_metric,
+    list_datasets,
+    list_metrics,
+)
+from .iterable_dataset import IterableDataset
+from .load import load_dataset, load_dataset_builder, load_from_disk, load_metric
+from .metric import Metric
+from .splits import (
+    NamedSplit,
+    NamedSplitAll,
+    Split,
+    SplitBase,
+    SplitDict,
+    SplitGenerator,
+    SplitInfo,
+    SubSplitInfo,
+    percent,
+)
+from .tasks import *
+from .utils import *
+from .utils import logging
+
+
+# isort: split
+
+# Deprecated modules
+from . import arrow_dataset as _arrow_dataset
+from . import utils as _utils
+from .exceptions import ExpectedMoreDownloadedFiles, ExpectedMoreSplits, UnexpectedDownloadedFile, UnexpectedSplits
+from .utils import download_manager as _deprecated_download_manager
+from .utils import info_utils as _deprecated_info_utils
+
+
+_arrow_dataset.concatenate_datasets = concatenate_datasets
+_utils.DownloadConfig = DownloadConfig
+_utils.DownloadManager = DownloadManager
+_utils.DownloadMode = DownloadMode
+_deprecated_download_manager.DownloadConfig = DownloadConfig
+_deprecated_download_manager.DownloadMode = DownloadMode
+_deprecated_download_manager.DownloadManager = DownloadManager
+_deprecated_info_utils.ExpectedMoreDownloadedFiles = ExpectedMoreDownloadedFiles
+_deprecated_info_utils.ExpectedMoreSplits = ExpectedMoreSplits
+_deprecated_info_utils.UnexpectedDownloadedFile = UnexpectedDownloadedFile
+_deprecated_info_utils.UnexpectedSplits = UnexpectedSplits
+
+del _arrow_dataset, _utils, _deprecated_download_manager
+del _deprecated_info_utils, ExpectedMoreDownloadedFiles, ExpectedMoreSplits, UnexpectedDownloadedFile, UnexpectedSplits
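Note: the names re-exported above (Dataset, concatenate_datasets, load_dataset, ...) are the library's main entry points. A minimal in-memory sketch using only names exported by this module; the column name and values are illustrative:

    from datasets import Dataset, concatenate_datasets

    a = Dataset.from_dict({"text": ["foo", "bar"]})  # Arrow-backed, in memory
    b = Dataset.from_dict({"text": ["baz"]})
    combined = concatenate_datasets([a, b])
    assert combined.num_rows == 3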
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/datasets/arrow_writer.py
ADDED
@@ -0,0 +1,746 @@
+# Copyright 2020 The HuggingFace Datasets Authors and the TensorFlow Datasets Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Lint as: python3
+"""To write records into Parquet files."""
+
+import errno
+import json
+import os
+import sys
+from pathlib import Path
+from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
+
+import fsspec
+import numpy as np
+import pyarrow as pa
+import pyarrow.parquet as pq
+from fsspec.core import url_to_fs
+
+from . import config
+from .features import Features, Image, Value
+from .features.features import (
+    FeatureType,
+    _ArrayXDExtensionType,
+    cast_to_python_objects,
+    generate_from_arrow_type,
+    get_nested_type,
+    list_of_np_array_to_pyarrow_listarray,
+    numpy_to_pyarrow_listarray,
+    to_pyarrow_listarray,
+)
+from .filesystems import is_remote_filesystem
+from .info import DatasetInfo
+from .keyhash import DuplicatedKeysError, KeyHasher
+from .table import array_cast, cast_array_to_feature, embed_table_storage, table_cast
+from .utils import logging
+from .utils import tqdm as hf_tqdm
+from .utils.file_utils import hash_url_to_filename
+from .utils.py_utils import asdict, first_non_null_value
+
+
+logger = logging.get_logger(__name__)
+
+type_ = type  # keep python's type function
+
+
+class SchemaInferenceError(ValueError):
+    pass
+
+
+class TypedSequence:
+    """
+    This data container generalizes the typing when instantiating pyarrow arrays, tables or batches.
+
+    More specifically it adds several features:
+    - Support extension types like ``datasets.features.Array2DExtensionType``:
+      By default pyarrow arrays don't return extension arrays. One has to call
+      ``pa.ExtensionArray.from_storage(type, pa.array(data, type.storage_type))``
+      in order to get an extension array.
+    - Support for ``try_type`` parameter that can be used instead of ``type``:
+      When an array is transformed, we like to keep the same type as before if possible.
+      For example when calling :func:`datasets.Dataset.map`, we don't want to change the type
+      of each column by default.
+    - Better error message when a pyarrow array overflows.
+
+    Example::
+
+        from datasets.features import Array2D, Array2DExtensionType, Value
+        from datasets.arrow_writer import TypedSequence
+        import pyarrow as pa
+
+        arr = pa.array(TypedSequence([1, 2, 3], type=Value("int32")))
+        assert arr.type == pa.int32()
+
+        arr = pa.array(TypedSequence([1, 2, 3], try_type=Value("int32")))
+        assert arr.type == pa.int32()
+
+        arr = pa.array(TypedSequence(["foo", "bar"], try_type=Value("int32")))
+        assert arr.type == pa.string()
+
+        arr = pa.array(TypedSequence([[[1, 2, 3]]], type=Array2D((1, 3), "int64")))
+        assert arr.type == Array2DExtensionType((1, 3), "int64")
+
+        table = pa.Table.from_pydict({
+            "image": TypedSequence([[[1, 2, 3]]], type=Array2D((1, 3), "int64"))
+        })
+        assert table["image"].type == Array2DExtensionType((1, 3), "int64")
+
+    """
+
+    def __init__(
+        self,
+        data: Iterable,
+        type: Optional[FeatureType] = None,
+        try_type: Optional[FeatureType] = None,
+        optimized_int_type: Optional[FeatureType] = None,
+    ):
+        # assert type is None or try_type is None,
+        if type is not None and try_type is not None:
+            raise ValueError("You cannot specify both type and try_type")
+        # set attributes
+        self.data = data
+        self.type = type
+        self.try_type = try_type  # is ignored if it doesn't match the data
+        self.optimized_int_type = optimized_int_type
+        # when trying a type (is ignored if data is not compatible)
+        self.trying_type = self.try_type is not None
+        self.trying_int_optimization = optimized_int_type is not None and type is None and try_type is None
+        # used to get back the inferred type after __arrow_array__() is called once
+        self._inferred_type = None
+
+    def get_inferred_type(self) -> FeatureType:
+        """Return the inferred feature type.
+        This is done by converting the sequence to an Arrow array, and getting the corresponding
+        feature type.
+
+        Since building the Arrow array can be expensive, the value of the inferred type is cached
+        as soon as pa.array is called on the typed sequence.
+
+        Returns:
+            FeatureType: inferred feature type of the sequence.
+        """
+        if self._inferred_type is None:
+            self._inferred_type = generate_from_arrow_type(pa.array(self).type)
+        return self._inferred_type
+
+    @staticmethod
+    def _infer_custom_type_and_encode(data: Iterable) -> Tuple[Iterable, Optional[FeatureType]]:
+        """Implement type inference for custom objects like PIL.Image.Image -> Image type.
+
+        This function is only used for custom python objects that can't be directly passed to build
+        an Arrow array. In such cases it infers the feature type to use, and it encodes the data so
+        that they can be passed to an Arrow array.
+
+        Args:
+            data (Iterable): array of data to infer the type, e.g. a list of PIL images.
+
+        Returns:
+            Tuple[Iterable, Optional[FeatureType]]: a tuple with:
+                - the (possibly encoded) array, if the inferred feature type requires encoding
+                - the inferred feature type if the array is made of supported custom objects like
+                  PIL images, else None.
+        """
+        if config.PIL_AVAILABLE and "PIL" in sys.modules:
+            import PIL.Image
+
+            non_null_idx, non_null_value = first_non_null_value(data)
+            if isinstance(non_null_value, PIL.Image.Image):
+                return [Image().encode_example(value) if value is not None else None for value in data], Image()
+        return data, None
+
+    def __arrow_array__(self, type: Optional[pa.DataType] = None):
+        """This function is called when calling pa.array(typed_sequence)"""
+
+        if type is not None:
+            raise ValueError("TypedSequence is supposed to be used with pa.array(typed_sequence, type=None)")
+        del type  # make sure we don't use it
+        data = self.data
+        # automatic type inference for custom objects
+        if self.type is None and self.try_type is None:
+            data, self._inferred_type = self._infer_custom_type_and_encode(data)
+        if self._inferred_type is None:
+            type = self.try_type if self.trying_type else self.type
+        else:
+            type = self._inferred_type
+        pa_type = get_nested_type(type) if type is not None else None
+        optimized_int_pa_type = (
+            get_nested_type(self.optimized_int_type) if self.optimized_int_type is not None else None
+        )
+        trying_cast_to_python_objects = False
+        try:
+            # custom pyarrow types
+            if isinstance(pa_type, _ArrayXDExtensionType):
+                storage = to_pyarrow_listarray(data, pa_type)
+                return pa.ExtensionArray.from_storage(pa_type, storage)
+
+            # efficient np array to pyarrow array
+            if isinstance(data, np.ndarray):
+                out = numpy_to_pyarrow_listarray(data)
+            elif isinstance(data, list) and data and isinstance(first_non_null_value(data)[1], np.ndarray):
+                out = list_of_np_array_to_pyarrow_listarray(data)
+            else:
+                trying_cast_to_python_objects = True
+                out = pa.array(cast_to_python_objects(data, only_1d_for_numpy=True))
+            # use smaller integer precisions if possible
+            if self.trying_int_optimization:
+                if pa.types.is_int64(out.type):
+                    out = out.cast(optimized_int_pa_type)
+                elif pa.types.is_list(out.type):
+                    if pa.types.is_int64(out.type.value_type):
+                        out = array_cast(out, pa.list_(optimized_int_pa_type))
+                    elif pa.types.is_list(out.type.value_type) and pa.types.is_int64(out.type.value_type.value_type):
+                        out = array_cast(out, pa.list_(pa.list_(optimized_int_pa_type)))
+            # otherwise we can finally use the user's type
+            elif type is not None:
+                # We use cast_array_to_feature to support casting to custom types like Audio and Image
|
| 206 |
+
# Also, when trying type "string", we don't want to convert integers or floats to "string".
|
| 207 |
+
# We only do it if trying_type is False - since this is what the user asks for.
|
| 208 |
+
out = cast_array_to_feature(
|
| 209 |
+
out, type, allow_primitive_to_str=not self.trying_type, allow_decimal_to_str=not self.trying_type
|
| 210 |
+
)
|
| 211 |
+
return out
|
| 212 |
+
except (
|
| 213 |
+
TypeError,
|
| 214 |
+
pa.lib.ArrowInvalid,
|
| 215 |
+
pa.lib.ArrowNotImplementedError,
|
| 216 |
+
) as e: # handle type errors and overflows
|
| 217 |
+
# Ignore ArrowNotImplementedError caused by trying type, otherwise re-raise
|
| 218 |
+
if not self.trying_type and isinstance(e, pa.lib.ArrowNotImplementedError):
|
| 219 |
+
raise
|
| 220 |
+
|
| 221 |
+
if self.trying_type:
|
| 222 |
+
try: # second chance
|
| 223 |
+
if isinstance(data, np.ndarray):
|
| 224 |
+
return numpy_to_pyarrow_listarray(data)
|
| 225 |
+
elif isinstance(data, list) and data and any(isinstance(value, np.ndarray) for value in data):
|
| 226 |
+
return list_of_np_array_to_pyarrow_listarray(data)
|
| 227 |
+
else:
|
| 228 |
+
trying_cast_to_python_objects = True
|
| 229 |
+
return pa.array(cast_to_python_objects(data, only_1d_for_numpy=True))
|
| 230 |
+
except pa.lib.ArrowInvalid as e:
|
| 231 |
+
if "overflow" in str(e):
|
| 232 |
+
raise OverflowError(
|
| 233 |
+
f"There was an overflow with type {type_(data)}. Try to reduce writer_batch_size to have batches smaller than 2GB.\n({e})"
|
| 234 |
+
) from None
|
| 235 |
+
elif self.trying_int_optimization and "not in range" in str(e):
|
| 236 |
+
optimized_int_pa_type_str = np.dtype(optimized_int_pa_type.to_pandas_dtype()).name
|
| 237 |
+
logger.info(
|
| 238 |
+
f"Failed to cast a sequence to {optimized_int_pa_type_str}. Falling back to int64."
|
| 239 |
+
)
|
| 240 |
+
return out
|
| 241 |
+
elif trying_cast_to_python_objects and "Could not convert" in str(e):
|
| 242 |
+
out = pa.array(
|
| 243 |
+
cast_to_python_objects(data, only_1d_for_numpy=True, optimize_list_casting=False)
|
| 244 |
+
)
|
| 245 |
+
if type is not None:
|
| 246 |
+
out = cast_array_to_feature(
|
| 247 |
+
out, type, allow_primitive_to_str=True, allow_decimal_to_str=True
|
| 248 |
+
)
|
| 249 |
+
return out
|
| 250 |
+
else:
|
| 251 |
+
raise
|
| 252 |
+
elif "overflow" in str(e):
|
| 253 |
+
raise OverflowError(
|
| 254 |
+
f"There was an overflow with type {type_(data)}. Try to reduce writer_batch_size to have batches smaller than 2GB.\n({e})"
|
| 255 |
+
) from None
|
| 256 |
+
elif self.trying_int_optimization and "not in range" in str(e):
|
| 257 |
+
optimized_int_pa_type_str = np.dtype(optimized_int_pa_type.to_pandas_dtype()).name
|
| 258 |
+
logger.info(f"Failed to cast a sequence to {optimized_int_pa_type_str}. Falling back to int64.")
|
| 259 |
+
return out
|
| 260 |
+
elif trying_cast_to_python_objects and "Could not convert" in str(e):
|
| 261 |
+
out = pa.array(cast_to_python_objects(data, only_1d_for_numpy=True, optimize_list_casting=False))
|
| 262 |
+
if type is not None:
|
| 263 |
+
out = cast_array_to_feature(out, type, allow_primitive_to_str=True, allow_decimal_to_str=True)
|
| 264 |
+
return out
|
| 265 |
+
else:
|
| 266 |
+
raise
|
| 267 |
+
|
| 268 |
+
|
| 269 |
+
class OptimizedTypedSequence(TypedSequence):
|
| 270 |
+
def __init__(
|
| 271 |
+
self,
|
| 272 |
+
data,
|
| 273 |
+
type: Optional[FeatureType] = None,
|
| 274 |
+
try_type: Optional[FeatureType] = None,
|
| 275 |
+
col: Optional[str] = None,
|
| 276 |
+
optimized_int_type: Optional[FeatureType] = None,
|
| 277 |
+
):
|
| 278 |
+
optimized_int_type_by_col = {
|
| 279 |
+
"attention_mask": Value("int8"), # binary tensor
|
| 280 |
+
"special_tokens_mask": Value("int8"),
|
| 281 |
+
"input_ids": Value("int32"), # typical vocab size: 0-50k (max ~500k, never > 1M)
|
| 282 |
+
"token_type_ids": Value(
|
| 283 |
+
"int8"
|
| 284 |
+
), # binary mask; some (XLNetModel) use an additional token represented by a 2
|
| 285 |
+
}
|
| 286 |
+
if type is None and try_type is None:
|
| 287 |
+
optimized_int_type = optimized_int_type_by_col.get(col, None)
|
| 288 |
+
super().__init__(data, type=type, try_type=try_type, optimized_int_type=optimized_int_type)
|
| 289 |
+
|
| 290 |
+
|
| 291 |
+
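

# A minimal, hypothetical sketch (not part of the original module) showing the
# column-based int optimization above: with no explicit `type`/`try_type`, known
# tokenizer columns are downcast via `try_type` semantics, falling back to int64
# when the values don't fit.
def _example_optimized_typed_sequence():
    seq = OptimizedTypedSequence([1, 0, 1, 1], col="attention_mask")
    assert pa.array(seq).type == pa.int8()  # downcast from int64 to int8
    seq = OptimizedTypedSequence([[0, 50256]], col="input_ids")
    assert pa.array(seq).type == pa.list_(pa.int32())  # nested ints are cast too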


class ArrowWriter:
    """Shuffles and writes Examples to Arrow files."""

    _WRITER_CLASS = pa.RecordBatchStreamWriter

    def __init__(
        self,
        schema: Optional[pa.Schema] = None,
        features: Optional[Features] = None,
        path: Optional[str] = None,
        stream: Optional[pa.NativeFile] = None,
        fingerprint: Optional[str] = None,
        writer_batch_size: Optional[int] = None,
        hash_salt: Optional[str] = None,
        check_duplicates: Optional[bool] = False,
        disable_nullable: bool = False,
        update_features: bool = False,
        with_metadata: bool = True,
        unit: str = "examples",
        embed_local_files: bool = False,
        storage_options: Optional[dict] = None,
    ):
        if path is None and stream is None:
            raise ValueError("At least one of path and stream must be provided.")
        if features is not None:
            self._features = features
            self._schema = None
        elif schema is not None:
            self._schema: pa.Schema = schema
            self._features = Features.from_arrow_schema(self._schema)
        else:
            self._features = None
            self._schema = None

        if hash_salt is not None:
            # Create KeyHasher instance using split name as hash salt
            self._hasher = KeyHasher(hash_salt)
        else:
            self._hasher = KeyHasher("")

        self._check_duplicates = check_duplicates
        self._disable_nullable = disable_nullable

        if stream is None:
            fs, path = url_to_fs(path, **(storage_options or {}))
            self._fs: fsspec.AbstractFileSystem = fs
            self._path = path if not is_remote_filesystem(self._fs) else self._fs.unstrip_protocol(path)
            self.stream = self._fs.open(path, "wb")
            self._closable_stream = True
        else:
            self._fs = None
            self._path = None
            self.stream = stream
            self._closable_stream = False

        self.fingerprint = fingerprint
        self.disable_nullable = disable_nullable
        self.writer_batch_size = writer_batch_size or config.DEFAULT_MAX_BATCH_SIZE
        self.update_features = update_features
        self.with_metadata = with_metadata
        self.unit = unit
        self.embed_local_files = embed_local_files

        self._num_examples = 0
        self._num_bytes = 0
        self.current_examples: List[Tuple[Dict[str, Any], str]] = []
        self.current_rows: List[pa.Table] = []
        self.pa_writer: Optional[pa.RecordBatchStreamWriter] = None
        self.hkey_record = []

    def __len__(self):
        """Return the number of written and staged examples"""
        return self._num_examples + len(self.current_examples) + len(self.current_rows)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()

    def close(self):
        # Try closing if opened; if closed: pyarrow.lib.ArrowInvalid: Invalid operation on closed file
        if self.pa_writer:  # it might be None
            try:
                self.pa_writer.close()
            except Exception:  # pyarrow.lib.ArrowInvalid, OSError
                pass
        if self._closable_stream and not self.stream.closed:
            self.stream.close()  # This also closes self.pa_writer if it is opened

    def _build_writer(self, inferred_schema: pa.Schema):
        schema = self.schema
        inferred_features = Features.from_arrow_schema(inferred_schema)
        if self._features is not None:
            if self.update_features:  # keep original features if they match, or update them
                fields = {field.name: field for field in self._features.type}
                for inferred_field in inferred_features.type:
                    name = inferred_field.name
                    if name in fields:
                        if inferred_field == fields[name]:
                            inferred_features[name] = self._features[name]
                self._features = inferred_features
                schema: pa.Schema = inferred_schema
        else:
            self._features = inferred_features
            schema: pa.Schema = inferred_features.arrow_schema
        if self.disable_nullable:
            schema = pa.schema(pa.field(field.name, field.type, nullable=False) for field in schema)
        if self.with_metadata:
            schema = schema.with_metadata(self._build_metadata(DatasetInfo(features=self._features), self.fingerprint))
        else:
            schema = schema.with_metadata({})
        self._schema = schema
        self.pa_writer = self._WRITER_CLASS(self.stream, schema)

    @property
    def schema(self):
        _schema = (
            self._schema
            if self._schema is not None
            else (pa.schema(self._features.type) if self._features is not None else None)
        )
        if self._disable_nullable and _schema is not None:
            _schema = pa.schema(pa.field(field.name, field.type, nullable=False) for field in _schema)
        return _schema if _schema is not None else []

    @staticmethod
    def _build_metadata(info: DatasetInfo, fingerprint: Optional[str] = None) -> Dict[str, str]:
        info_keys = ["features"]  # we can add support for more DatasetInfo keys in the future
        info_as_dict = asdict(info)
        metadata = {}
        metadata["info"] = {key: info_as_dict[key] for key in info_keys}
        if fingerprint is not None:
            metadata["fingerprint"] = fingerprint
        return {"huggingface": json.dumps(metadata)}

    def write_examples_on_file(self):
        """Write stored examples from the write-pool of examples. It makes a table out of the examples and writes it."""
        if not self.current_examples:
            return
        # preserve the order of the columns
        if self.schema:
            schema_cols = set(self.schema.names)
            examples_cols = self.current_examples[0][0].keys()  # .keys() preserves the order (unlike set)
            common_cols = [col for col in self.schema.names if col in examples_cols]
            extra_cols = [col for col in examples_cols if col not in schema_cols]
            cols = common_cols + extra_cols
        else:
            cols = list(self.current_examples[0][0])
        batch_examples = {}
        for col in cols:
            # We use row[0][col] since current_examples contains (example, key) tuples.
            # Moreover, examples could be Arrow arrays of 1 element.
            # This can happen in `.map()` when we want to re-write the same Arrow data
            if all(isinstance(row[0][col], (pa.Array, pa.ChunkedArray)) for row in self.current_examples):
                arrays = [row[0][col] for row in self.current_examples]
                arrays = [
                    chunk
                    for array in arrays
                    for chunk in (array.chunks if isinstance(array, pa.ChunkedArray) else [array])
                ]
                batch_examples[col] = pa.concat_arrays(arrays)
            else:
                batch_examples[col] = [
                    row[0][col].to_pylist()[0] if isinstance(row[0][col], (pa.Array, pa.ChunkedArray)) else row[0][col]
                    for row in self.current_examples
                ]
        self.write_batch(batch_examples=batch_examples)
        self.current_examples = []

    def write_rows_on_file(self):
        """Write stored rows from the write-pool of rows. It concatenates the single-row tables and writes the resulting table."""
        if not self.current_rows:
            return
        table = pa.concat_tables(self.current_rows)
        self.write_table(table)
        self.current_rows = []

    def write(
        self,
        example: Dict[str, Any],
        key: Optional[Union[str, int, bytes]] = None,
        writer_batch_size: Optional[int] = None,
    ):
        """Add a given (Example, Key) pair to the write-pool of examples which is written to file.

        Args:
            example: the Example to add.
            key: Optional, a unique identifier (str, int or bytes) associated with each example
        """
        # Utilize the keys and duplicate checking when `self._check_duplicates` is passed True
        if self._check_duplicates:
            # Create unique hash from key and store as (key, example) pairs
            hash = self._hasher.hash(key)
            self.current_examples.append((example, hash))
            # Maintain record of keys and their respective hashes for checking duplicates
            self.hkey_record.append((hash, key))
        else:
            # Store example as a tuple so as to keep the structure of `self.current_examples` uniform
            self.current_examples.append((example, ""))

        if writer_batch_size is None:
            writer_batch_size = self.writer_batch_size
        if writer_batch_size is not None and len(self.current_examples) >= writer_batch_size:
            if self._check_duplicates:
                self.check_duplicate_keys()
                # Re-initializing to empty list for next batch
                self.hkey_record = []

            self.write_examples_on_file()

    def check_duplicate_keys(self):
        """Raises error if duplicates found in a batch"""
        tmp_record = set()
        for hash, key in self.hkey_record:
            if hash in tmp_record:
                duplicate_key_indices = [
                    str(self._num_examples + index)
                    for index, (duplicate_hash, _) in enumerate(self.hkey_record)
                    if duplicate_hash == hash
                ]

                raise DuplicatedKeysError(key, duplicate_key_indices)
            else:
                tmp_record.add(hash)

    def write_row(self, row: pa.Table, writer_batch_size: Optional[int] = None):
        """Add a given single-row Table to the write-pool of rows which is written to file.

        Args:
            row: the row to add.
        """
        if len(row) != 1:
            raise ValueError(f"Only single-row pyarrow tables are allowed but got table with {len(row)} rows.")
        self.current_rows.append(row)
        if writer_batch_size is None:
            writer_batch_size = self.writer_batch_size
        if writer_batch_size is not None and len(self.current_rows) >= writer_batch_size:
            self.write_rows_on_file()

    def write_batch(
        self,
        batch_examples: Dict[str, List],
        writer_batch_size: Optional[int] = None,
    ):
        """Write a batch of Examples to file.
        Ignores the batch if it appears to be empty,
        preventing a potential schema update of unknown types.

        Args:
            batch_examples: the batch of examples to add.
        """
        if batch_examples and len(next(iter(batch_examples.values()))) == 0:
            return
        features = None if self.pa_writer is None and self.update_features else self._features
        try_features = self._features if self.pa_writer is None and self.update_features else None
        arrays = []
        inferred_features = Features()
        # preserve the order of the columns
        if self.schema:
            schema_cols = set(self.schema.names)
            batch_cols = batch_examples.keys()  # .keys() preserves the order (unlike set)
            common_cols = [col for col in self.schema.names if col in batch_cols]
            extra_cols = [col for col in batch_cols if col not in schema_cols]
            cols = common_cols + extra_cols
        else:
            cols = list(batch_examples)
        for col in cols:
            col_values = batch_examples[col]
            col_type = features[col] if features else None
            if isinstance(col_values, (pa.Array, pa.ChunkedArray)):
                array = cast_array_to_feature(col_values, col_type) if col_type is not None else col_values
                arrays.append(array)
                inferred_features[col] = generate_from_arrow_type(col_values.type)
            else:
                col_try_type = try_features[col] if try_features is not None and col in try_features else None
                typed_sequence = OptimizedTypedSequence(col_values, type=col_type, try_type=col_try_type, col=col)
                arrays.append(pa.array(typed_sequence))
                inferred_features[col] = typed_sequence.get_inferred_type()
        schema = inferred_features.arrow_schema if self.pa_writer is None else self.schema
        pa_table = pa.Table.from_arrays(arrays, schema=schema)
        self.write_table(pa_table, writer_batch_size)

    def write_table(self, pa_table: pa.Table, writer_batch_size: Optional[int] = None):
        """Write a Table to file.

        Args:
            pa_table: the Table to add.
        """
        if writer_batch_size is None:
            writer_batch_size = self.writer_batch_size
        if self.pa_writer is None:
            self._build_writer(inferred_schema=pa_table.schema)
        pa_table = pa_table.combine_chunks()
        pa_table = table_cast(pa_table, self._schema)
        if self.embed_local_files:
            pa_table = embed_table_storage(pa_table)
        self._num_bytes += pa_table.nbytes
        self._num_examples += pa_table.num_rows
        self.pa_writer.write_table(pa_table, writer_batch_size)

    def finalize(self, close_stream=True):
        self.write_rows_on_file()
        # In case current_examples < writer_batch_size, but user uses finalize()
        if self._check_duplicates:
            self.check_duplicate_keys()
            # Re-initializing to empty list for next batch
            self.hkey_record = []
        self.write_examples_on_file()
        # If schema is known, infer features even if no examples were written
        if self.pa_writer is None and self.schema:
            self._build_writer(self.schema)
        if self.pa_writer is not None:
            self.pa_writer.close()
            self.pa_writer = None
            if close_stream:
                self.stream.close()
        else:
            if close_stream:
                self.stream.close()
            raise SchemaInferenceError("Please pass `features` or at least one example when writing data")
        logger.debug(
            f"Done writing {self._num_examples} {self.unit} in {self._num_bytes} bytes {self._path if self._path else ''}."
        )
        return self._num_examples, self._num_bytes
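

# A hedged usage sketch (not part of the original module): writing a couple of
# examples to a local Arrow file; the path and function name are hypothetical.
def _example_arrow_writer(path="data.arrow"):
    with ArrowWriter(path=path, features=Features({"text": Value("string")})) as writer:
        writer.write({"text": "hello"})
        writer.write({"text": "world"})
        num_examples, num_bytes = writer.finalize()  # flushes staged examples first
    return num_examples, num_bytes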


class ParquetWriter(ArrowWriter):
    _WRITER_CLASS = pq.ParquetWriter


class BeamWriter:
    """
    Shuffles and writes Examples to Arrow files.
    The Arrow files are converted from Parquet files that are the output of Apache Beam pipelines.
    """

    def __init__(
        self,
        features: Optional[Features] = None,
        schema: Optional[pa.Schema] = None,
        path: Optional[str] = None,
        namespace: Optional[str] = None,
        cache_dir: Optional[str] = None,
    ):
        if features is None and schema is None:
            raise ValueError("At least one of features and schema must be provided.")
        if path is None:
            raise ValueError("Path must be provided.")

        if features is not None:
            self._features: Features = features
            self._schema: pa.Schema = features.arrow_schema
        else:
            self._schema: pa.Schema = schema
            self._features: Features = Features.from_arrow_schema(schema)

        self._path = path
        self._parquet_path = os.path.splitext(path)[0]  # remove extension
        self._namespace = namespace or "default"
        self._num_examples = None
        self._cache_dir = cache_dir or config.HF_DATASETS_CACHE

    def write_from_pcollection(self, pcoll_examples):
        """Add the final steps of the beam pipeline: write to parquet files."""
        import apache_beam as beam

        def inc_num_examples(example):
            beam.metrics.Metrics.counter(self._namespace, "num_examples").inc()

        # count examples
        _ = pcoll_examples | "Count N. Examples" >> beam.Map(inc_num_examples)

        # save dataset
        return (
            pcoll_examples
            | "Get values" >> beam.Values()
            | "Save to parquet"
            >> beam.io.parquetio.WriteToParquet(
                self._parquet_path, self._schema, shard_name_template="-SSSSS-of-NNNNN.parquet"
            )
        )

    def finalize(self, metrics_query_result: dict):
        """
        Run after the pipeline has finished.
        It converts the resulting parquet files to arrow and completes the info from the pipeline metrics.

        Args:
            metrics_query_result: `dict` obtained from pipeline_results.metrics().query(m_filter). Make sure
                that the filter keeps only the metrics for the considered split, under the namespace `split_name`.
        """

        # Beam FileSystems require the system's path separator in the older versions
        fs, parquet_path = url_to_fs(self._parquet_path)
        parquet_path = str(Path(parquet_path)) if not is_remote_filesystem(fs) else fs.unstrip_protocol(parquet_path)

        shards = fs.glob(parquet_path + "*.parquet")
        num_bytes = sum(fs.sizes(shards))
        shard_lengths = get_parquet_lengths(shards)

        # Convert to arrow
        if self._path.endswith(".arrow"):
            logger.info(f"Converting parquet files {self._parquet_path} to arrow {self._path}")
            try:  # stream conversion
                num_bytes = 0
                for shard in hf_tqdm(shards, unit="shards"):
                    with fs.open(shard, "rb") as source:
                        with fs.open(shard.replace(".parquet", ".arrow"), "wb") as destination:
                            shard_num_bytes, _ = parquet_to_arrow(source, destination)
                            num_bytes += shard_num_bytes
            except OSError as e:  # broken pipe can happen if the connection is unstable, do local conversion instead
                if e.errno != errno.EPIPE:  # not a broken pipe
                    raise
                logger.warning(
                    "Broken Pipe during stream conversion from parquet to arrow. Using local convert instead"
                )
                local_convert_dir = os.path.join(self._cache_dir, "beam_convert")
                os.makedirs(local_convert_dir, exist_ok=True)
                num_bytes = 0
                for shard in hf_tqdm(shards, unit="shards"):
                    local_parquet_path = os.path.join(local_convert_dir, hash_url_to_filename(shard) + ".parquet")
                    fs.download(shard, local_parquet_path)
                    local_arrow_path = local_parquet_path.replace(".parquet", ".arrow")
                    shard_num_bytes, _ = parquet_to_arrow(local_parquet_path, local_arrow_path)
                    num_bytes += shard_num_bytes
                    remote_arrow_path = shard.replace(".parquet", ".arrow")
                    fs.upload(local_arrow_path, remote_arrow_path)

        # Save metrics
        counters_dict = {metric.key.metric.name: metric.result for metric in metrics_query_result["counters"]}
        self._num_examples = counters_dict["num_examples"]
        self._num_bytes = num_bytes
        self._shard_lengths = shard_lengths
        return self._num_examples, self._num_bytes


def get_parquet_lengths(sources) -> List[int]:
    shard_lengths = []
    for source in hf_tqdm(sources, unit="parquet files"):
        parquet_file = pa.parquet.ParquetFile(source)
        shard_lengths.append(parquet_file.metadata.num_rows)
    return shard_lengths


def parquet_to_arrow(source, destination) -> Tuple[int, int]:
    """Convert parquet file to arrow file. Inputs can be str paths or file-like objects"""
    stream = None if isinstance(destination, str) else destination
    parquet_file = pa.parquet.ParquetFile(source)
    # Beam can create empty Parquet files, so we need to pass the source Parquet file's schema
    with ArrowWriter(schema=parquet_file.schema_arrow, path=destination, stream=stream) as writer:
        for record_batch in parquet_file.iter_batches():
            pa_table = pa.Table.from_batches([record_batch])
            writer.write_table(pa_table)
        num_bytes, num_examples = writer.finalize()
    return num_bytes, num_examples
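

# A hedged sketch (not part of the original module): converting a local Parquet
# shard to Arrow with the helper above; both file paths are hypothetical.
def _example_parquet_to_arrow():
    num_bytes, num_examples = parquet_to_arrow("shard.parquet", "shard.arrow")
    return num_bytes, num_examples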
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/datasets/combine.py
ADDED
@@ -0,0 +1,215 @@
from typing import List, Optional, TypeVar

from .arrow_dataset import Dataset, _concatenate_map_style_datasets, _interleave_map_style_datasets
from .dataset_dict import DatasetDict, IterableDatasetDict
from .info import DatasetInfo
from .iterable_dataset import IterableDataset, _concatenate_iterable_datasets, _interleave_iterable_datasets
from .splits import NamedSplit
from .utils import logging
from .utils.py_utils import Literal


logger = logging.get_logger(__name__)


DatasetType = TypeVar("DatasetType", Dataset, IterableDataset)


def interleave_datasets(
    datasets: List[DatasetType],
    probabilities: Optional[List[float]] = None,
    seed: Optional[int] = None,
    info: Optional[DatasetInfo] = None,
    split: Optional[NamedSplit] = None,
    stopping_strategy: Literal["first_exhausted", "all_exhausted"] = "first_exhausted",
) -> DatasetType:
    """
    Interleave several datasets (sources) into a single dataset.
    The new dataset is constructed by alternating between the sources to get the examples.

    You can use this function on a list of [`Dataset`] objects, or on a list of [`IterableDataset`] objects.

    - If `probabilities` is `None` (default) the new dataset is constructed by cycling between each source to get the examples.
    - If `probabilities` is not `None`, the new dataset is constructed by getting examples from a random source at a time according to the provided probabilities.

    The resulting dataset ends when one of the source datasets runs out of examples, except when `oversampling` is `True`,
    in which case the resulting dataset ends when all datasets have run out of examples at least once.

    Note for iterable datasets:

    In a distributed setup or in PyTorch DataLoader workers, the stopping strategy is applied per process.
    Therefore the "first_exhausted" strategy on a sharded iterable dataset can generate fewer samples in total (up to 1 missing sample per subdataset per worker).

    Args:
        datasets (`List[Dataset]` or `List[IterableDataset]`):
            List of datasets to interleave.
        probabilities (`List[float]`, *optional*, defaults to `None`):
            If specified, the new dataset is constructed by sampling
            examples from one source at a time according to these probabilities.
        seed (`int`, *optional*, defaults to `None`):
            The random seed used to choose a source for each example.
        info ([`DatasetInfo`], *optional*):
            Dataset information, like description, citation, etc.
            <Added version="2.4.0"/>
        split ([`NamedSplit`], *optional*):
            Name of the dataset split.
            <Added version="2.4.0"/>
        stopping_strategy (`str`, defaults to `first_exhausted`):
            Two strategies are proposed right now, `first_exhausted` and `all_exhausted`.
            By default, `first_exhausted` is an undersampling strategy, i.e. the dataset construction is stopped as soon as one dataset has run out of samples.
            If the strategy is `all_exhausted`, we use an oversampling strategy, i.e. the dataset construction is stopped as soon as every sample of every dataset has been added at least once.
            Note that if the strategy is `all_exhausted`, the interleaved dataset size can get enormous:
            - with no probabilities, the resulting dataset will have `max_length_datasets*nb_dataset` samples.
            - with given probabilities, the resulting dataset will have more samples if some datasets have a really low probability of being visited.
    Returns:
        [`Dataset`] or [`IterableDataset`]: Return type depends on the input `datasets`
        parameter. `Dataset` if the input is a list of `Dataset`, `IterableDataset` if the input is a list of
        `IterableDataset`.

    Example:

        For regular datasets (map-style):

        ```python
        >>> from datasets import Dataset, interleave_datasets
        >>> d1 = Dataset.from_dict({"a": [0, 1, 2]})
        >>> d2 = Dataset.from_dict({"a": [10, 11, 12]})
        >>> d3 = Dataset.from_dict({"a": [20, 21, 22]})
        >>> dataset = interleave_datasets([d1, d2, d3], probabilities=[0.7, 0.2, 0.1], seed=42, stopping_strategy="all_exhausted")
        >>> dataset["a"]
        [10, 0, 11, 1, 2, 20, 12, 10, 0, 1, 2, 21, 0, 11, 1, 2, 0, 1, 12, 2, 10, 0, 22]
        >>> dataset = interleave_datasets([d1, d2, d3], probabilities=[0.7, 0.2, 0.1], seed=42)
        >>> dataset["a"]
        [10, 0, 11, 1, 2]
        >>> dataset = interleave_datasets([d1, d2, d3])
        >>> dataset["a"]
        [0, 10, 20, 1, 11, 21, 2, 12, 22]
        >>> dataset = interleave_datasets([d1, d2, d3], stopping_strategy="all_exhausted")
        >>> dataset["a"]
        [0, 10, 20, 1, 11, 21, 2, 12, 22]
        >>> d1 = Dataset.from_dict({"a": [0, 1, 2]})
        >>> d2 = Dataset.from_dict({"a": [10, 11, 12, 13]})
        >>> d3 = Dataset.from_dict({"a": [20, 21, 22, 23, 24]})
        >>> dataset = interleave_datasets([d1, d2, d3])
        >>> dataset["a"]
        [0, 10, 20, 1, 11, 21, 2, 12, 22]
        >>> dataset = interleave_datasets([d1, d2, d3], stopping_strategy="all_exhausted")
        >>> dataset["a"]
        [0, 10, 20, 1, 11, 21, 2, 12, 22, 0, 13, 23, 1, 10, 24]
        >>> dataset = interleave_datasets([d1, d2, d3], probabilities=[0.7, 0.2, 0.1], seed=42)
        >>> dataset["a"]
        [10, 0, 11, 1, 2]
        >>> dataset = interleave_datasets([d1, d2, d3], probabilities=[0.7, 0.2, 0.1], seed=42, stopping_strategy="all_exhausted")
        >>> dataset["a"]
        [10, 0, 11, 1, 2, 20, 12, 13, ..., 0, 1, 2, 0, 24]
        ```

        For datasets in streaming mode (iterable):

        ```python
        >>> from datasets import load_dataset, interleave_datasets
        >>> d1 = load_dataset("oscar", "unshuffled_deduplicated_en", split="train", streaming=True)
        >>> d2 = load_dataset("oscar", "unshuffled_deduplicated_fr", split="train", streaming=True)
        >>> dataset = interleave_datasets([d1, d2])
        >>> iterator = iter(dataset)
        >>> next(iterator)
        {'text': 'Mtendere Village was inspired by the vision...}
        >>> next(iterator)
        {'text': "Média de débat d'idées, de culture...}
        ```
    """
    from .arrow_dataset import Dataset
    from .iterable_dataset import IterableDataset

    if not datasets:
        raise ValueError("Unable to interleave an empty list of datasets.")
    for i, dataset in enumerate(datasets):
        if not isinstance(dataset, (Dataset, IterableDataset)):
            if isinstance(dataset, (DatasetDict, IterableDatasetDict)):
                if not dataset:
                    raise ValueError(
                        f"Expected a list of Dataset objects or a list of IterableDataset objects, but element at position {i} "
                        "is an empty dataset dictionary."
                    )
                raise ValueError(
                    f"Dataset at position {i} has at least one split: {list(dataset)}\n"
                    f"Please pick one to interleave with the other datasets, for example: dataset['{next(iter(dataset))}']"
                )
            raise ValueError(
                f"Expected a list of Dataset objects or a list of IterableDataset objects, but element at position {i} is a {type(dataset).__name__}."
            )
        if i == 0:
            dataset_type, other_type = (
                (Dataset, IterableDataset) if isinstance(dataset, Dataset) else (IterableDataset, Dataset)
            )
        elif not isinstance(dataset, dataset_type):
            raise ValueError(
                f"Unable to interleave a {dataset_type.__name__} (at position 0) with a {other_type.__name__} (at position {i}). Expected a list of Dataset objects or a list of IterableDataset objects."
            )
    if stopping_strategy not in ["first_exhausted", "all_exhausted"]:
        raise ValueError(f"{stopping_strategy} is not supported. Please enter a valid stopping_strategy.")
    if dataset_type is Dataset:
        return _interleave_map_style_datasets(
            datasets, probabilities, seed, info=info, split=split, stopping_strategy=stopping_strategy
        )
    else:
        return _interleave_iterable_datasets(
            datasets, probabilities, seed, info=info, split=split, stopping_strategy=stopping_strategy
        )


def concatenate_datasets(
    dsets: List[DatasetType],
    info: Optional[DatasetInfo] = None,
    split: Optional[NamedSplit] = None,
    axis: int = 0,
) -> DatasetType:
    """
    Converts a list of [`Dataset`] objects with the same schema into a single [`Dataset`].

    Args:
        dsets (`List[datasets.Dataset]`):
            List of Datasets to concatenate.
        info (`DatasetInfo`, *optional*):
            Dataset information, like description, citation, etc.
        split (`NamedSplit`, *optional*):
            Name of the dataset split.
        axis (`{0, 1}`, defaults to `0`):
            Axis to concatenate over, where `0` means over rows (vertically) and `1` means over columns
            (horizontally).

            <Added version="1.6.0"/>

    Example:

    ```py
    >>> ds3 = concatenate_datasets([ds1, ds2])
    ```
    """

    if not dsets:
        raise ValueError("Unable to concatenate an empty list of datasets.")
    for i, dataset in enumerate(dsets):
        if not isinstance(dataset, (Dataset, IterableDataset)):
            if isinstance(dataset, (DatasetDict, IterableDatasetDict)):
                if not dataset:
                    raise ValueError(
                        f"Expected a list of Dataset objects or a list of IterableDataset objects, but element at position {i} "
                        "is an empty dataset dictionary."
                    )
                raise ValueError(
                    f"Dataset at position {i} has at least one split: {list(dataset)}\n"
                    f"Please pick one to concatenate with the other datasets, for example: dataset['{next(iter(dataset))}']"
                )
            raise ValueError(
                f"Expected a list of Dataset objects or a list of IterableDataset objects, but element at position {i} is a {type(dataset).__name__}."
            )
        if i == 0:
            dataset_type, other_type = (
                (Dataset, IterableDataset) if isinstance(dataset, Dataset) else (IterableDataset, Dataset)
            )
        elif not isinstance(dataset, dataset_type):
            raise ValueError(
                f"Unable to concatenate a {dataset_type.__name__} (at position 0) with a {other_type.__name__} (at position {i}). Expected a list of Dataset objects or a list of IterableDataset objects."
            )
    if dataset_type is Dataset:
        return _concatenate_map_style_datasets(dsets, info=info, split=split, axis=axis)
    else:
        return _concatenate_iterable_datasets(dsets, info=info, split=split, axis=axis)
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/datasets/data_files.py
ADDED
@@ -0,0 +1,825 @@
import os
import re
from functools import partial
from glob import has_magic
from pathlib import Path, PurePath
from typing import Callable, Dict, List, Optional, Set, Tuple, Union

import huggingface_hub
from fsspec.core import url_to_fs
from fsspec.implementations.http import HTTPFileSystem
from huggingface_hub import HfFileSystem
from packaging import version
from tqdm.contrib.concurrent import thread_map

from . import config
from .download import DownloadConfig
from .naming import _split_re
from .splits import Split
from .utils import logging
from .utils import tqdm as hf_tqdm
from .utils.file_utils import _prepare_path_and_storage_options, is_local_path, is_relative_path, xbasename, xjoin
from .utils.py_utils import glob_pattern_to_regex, string_to_dict


SingleOriginMetadata = Union[Tuple[str, str], Tuple[str], Tuple[()]]


SANITIZED_DEFAULT_SPLIT = str(Split.TRAIN)


logger = logging.get_logger(__name__)


class Url(str):
    pass


class EmptyDatasetError(FileNotFoundError):
    pass


SPLIT_PATTERN_SHARDED = "data/{split}-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9]*.*"

SPLIT_KEYWORDS = {
    Split.TRAIN: ["train", "training"],
    Split.VALIDATION: ["validation", "valid", "dev", "val"],
    Split.TEST: ["test", "testing", "eval", "evaluation"],
}
NON_WORDS_CHARS = "-._ 0-9"
if config.FSSPEC_VERSION < version.parse("2023.9.0"):
    KEYWORDS_IN_FILENAME_BASE_PATTERNS = ["**[{sep}/]{keyword}[{sep}]*", "{keyword}[{sep}]*"]
    KEYWORDS_IN_DIR_NAME_BASE_PATTERNS = [
        "{keyword}/**",
        "{keyword}[{sep}]*/**",
        "**[{sep}/]{keyword}/**",
        "**[{sep}/]{keyword}[{sep}]*/**",
    ]
elif config.FSSPEC_VERSION < version.parse("2023.12.0"):
    KEYWORDS_IN_FILENAME_BASE_PATTERNS = ["**/*[{sep}/]{keyword}[{sep}]*", "{keyword}[{sep}]*"]
    KEYWORDS_IN_DIR_NAME_BASE_PATTERNS = [
        "{keyword}/**/*",
        "{keyword}[{sep}]*/**/*",
        "**/*[{sep}/]{keyword}/**/*",
        "**/*[{sep}/]{keyword}[{sep}]*/**/*",
    ]
else:
    KEYWORDS_IN_FILENAME_BASE_PATTERNS = ["**/{keyword}[{sep}]*", "**/*[{sep}]{keyword}[{sep}]*"]
    KEYWORDS_IN_DIR_NAME_BASE_PATTERNS = [
        "**/{keyword}/**",
        "**/{keyword}[{sep}]*/**",
        "**/*[{sep}]{keyword}/**",
        "**/*[{sep}]{keyword}[{sep}]*/**",
    ]

DEFAULT_SPLITS = [Split.TRAIN, Split.VALIDATION, Split.TEST]
DEFAULT_PATTERNS_SPLIT_IN_FILENAME = {
    split: [
        pattern.format(keyword=keyword, sep=NON_WORDS_CHARS)
        for keyword in SPLIT_KEYWORDS[split]
        for pattern in KEYWORDS_IN_FILENAME_BASE_PATTERNS
    ]
    for split in DEFAULT_SPLITS
}
DEFAULT_PATTERNS_SPLIT_IN_DIR_NAME = {
    split: [
        pattern.format(keyword=keyword, sep=NON_WORDS_CHARS)
        for keyword in SPLIT_KEYWORDS[split]
        for pattern in KEYWORDS_IN_DIR_NAME_BASE_PATTERNS
    ]
    for split in DEFAULT_SPLITS
}


DEFAULT_PATTERNS_ALL = {
    Split.TRAIN: ["**"],
}

ALL_SPLIT_PATTERNS = [SPLIT_PATTERN_SHARDED]
ALL_DEFAULT_PATTERNS = [
    DEFAULT_PATTERNS_SPLIT_IN_DIR_NAME,
    DEFAULT_PATTERNS_SPLIT_IN_FILENAME,
    DEFAULT_PATTERNS_ALL,
]
if config.FSSPEC_VERSION < version.parse("2023.9.0"):
    METADATA_PATTERNS = [
        "metadata.csv",
        "**/metadata.csv",
        "metadata.jsonl",
        "**/metadata.jsonl",
    ]  # metadata file for ImageFolder and AudioFolder
else:
    METADATA_PATTERNS = [
        "**/metadata.csv",
        "**/metadata.jsonl",
    ]  # metadata file for ImageFolder and AudioFolder
WILDCARD_CHARACTERS = "*[]"
FILES_TO_IGNORE = [
    "README.md",
    "config.json",
    "dataset_info.json",
    "dataset_infos.json",
    "dummy_data.zip",
    "dataset_dict.json",
]


def contains_wildcards(pattern: str) -> bool:
    return any(wildcard_character in pattern for wildcard_character in WILDCARD_CHARACTERS)


def sanitize_patterns(patterns: Union[Dict, List, str]) -> Dict[str, Union[List[str], "DataFilesList"]]:
    """
    Take the data_files patterns from the user, and format them into a dictionary.
    Each key is the name of the split, and each value is a list of data files patterns (paths or urls).
    The default split is "train".

    Returns:
        patterns: dictionary of split_name -> list of patterns
    """
    if isinstance(patterns, dict):
        return {str(key): value if isinstance(value, list) else [value] for key, value in patterns.items()}
    elif isinstance(patterns, str):
        return {SANITIZED_DEFAULT_SPLIT: [patterns]}
    elif isinstance(patterns, list):
        if any(isinstance(pattern, dict) for pattern in patterns):
            for pattern in patterns:
                if not (
                    isinstance(pattern, dict)
                    and len(pattern) == 2
                    and "split" in pattern
                    and isinstance(pattern.get("path"), (str, list))
                ):
                    raise ValueError(
                        f"Expected each split to have a 'path' key which can be a string or a list of strings, but got {pattern}"
                    )
            splits = [pattern["split"] for pattern in patterns]
            if len(set(splits)) != len(splits):
                raise ValueError(f"Some splits are duplicated in data_files: {splits}")
            return {
                str(pattern["split"]): pattern["path"] if isinstance(pattern["path"], list) else [pattern["path"]]
                for pattern in patterns
            }
        else:
            return {SANITIZED_DEFAULT_SPLIT: patterns}
    else:
        return sanitize_patterns(list(patterns))
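

# A hedged sketch (not part of the original module) of the input shapes accepted
# by sanitize_patterns; the function name is hypothetical.
def _example_sanitize_patterns():
    assert sanitize_patterns("data/*.csv") == {"train": ["data/*.csv"]}
    assert sanitize_patterns(["a.csv", "b.csv"]) == {"train": ["a.csv", "b.csv"]}
    assert sanitize_patterns({"test": "test.csv"}) == {"test": ["test.csv"]}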
def _is_inside_unrequested_special_dir(matched_rel_path: str, pattern: str) -> bool:
|
| 170 |
+
"""
|
| 171 |
+
When a path matches a pattern, we additionnally check if it's inside a special directory
|
| 172 |
+
we ignore by default (if it starts with a double underscore).
|
| 173 |
+
|
| 174 |
+
Users can still explicitly request a filepath inside such a directory if "__pycache__" is
|
| 175 |
+
mentioned explicitly in the requested pattern.
|
| 176 |
+
|
| 177 |
+
Some examples:
|
| 178 |
+
|
| 179 |
+
base directory:
|
| 180 |
+
|
| 181 |
+
./
|
| 182 |
+
└── __pycache__
|
| 183 |
+
└── b.txt
|
| 184 |
+
|
| 185 |
+
>>> _is_inside_unrequested_special_dir("__pycache__/b.txt", "**")
|
| 186 |
+
True
|
| 187 |
+
>>> _is_inside_unrequested_special_dir("__pycache__/b.txt", "*/b.txt")
|
| 188 |
+
True
|
| 189 |
+
>>> _is_inside_unrequested_special_dir("__pycache__/b.txt", "__pycache__/*")
|
| 190 |
+
False
|
| 191 |
+
>>> _is_inside_unrequested_special_dir("__pycache__/b.txt", "__*/*")
|
| 192 |
+
False
|
| 193 |
+
"""
|
| 194 |
+
# We just need to check if every special directories from the path is present explicly in the pattern.
|
| 195 |
+
# Since we assume that the path matches the pattern, it's equivalent to counting that both
|
| 196 |
+
# the parent path and the parent pattern have the same number of special directories.
|
| 197 |
+
data_dirs_to_ignore_in_path = [part for part in PurePath(matched_rel_path).parent.parts if part.startswith("__")]
|
| 198 |
+
data_dirs_to_ignore_in_pattern = [part for part in PurePath(pattern).parent.parts if part.startswith("__")]
|
| 199 |
+
return len(data_dirs_to_ignore_in_path) != len(data_dirs_to_ignore_in_pattern)
|
| 200 |
+
|
| 201 |
+
|
| 202 |
+
def _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(matched_rel_path: str, pattern: str) -> bool:
    """
    When a path matches a pattern, we additionally check if it's a hidden file or if it's inside
    a hidden directory we ignore by default, i.e. if the file name or a parent directory name starts with a dot.

    Users can still explicitly request a filepath that is hidden or is inside a hidden directory
    if the hidden part is mentioned explicitly in the requested pattern.

    Some examples:

    base directory:

        ./
        └── .hidden_file.txt

    >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_file.txt", "**")
    True
    >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_file.txt", ".*")
    False

    base directory:

        ./
        └── .hidden_dir
            └── a.txt

    >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/a.txt", "**")
    True
    >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/a.txt", ".*/*")
    False
    >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/a.txt", ".hidden_dir/*")
    False

    base directory:

        ./
        └── .hidden_dir
            └── .hidden_file.txt

    >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/.hidden_file.txt", "**")
    True
    >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/.hidden_file.txt", ".*/*")
    True
    >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/.hidden_file.txt", ".*/.*")
    False
    >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/.hidden_file.txt", ".hidden_dir/*")
    True
    >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/.hidden_file.txt", ".hidden_dir/.*")
    False
    """
    # We just need to check if every hidden part from the path is present explicitly in the pattern.
    # Since we assume that the path matches the pattern, it's equivalent to checking that both
    # the path and the pattern have the same number of hidden parts.
    hidden_directories_in_path = [
        part for part in PurePath(matched_rel_path).parts if part.startswith(".") and not set(part) == {"."}
    ]
    hidden_directories_in_pattern = [
        part for part in PurePath(pattern).parts if part.startswith(".") and not set(part) == {"."}
    ]
    return len(hidden_directories_in_path) != len(hidden_directories_in_pattern)

def _get_data_files_patterns(pattern_resolver: Callable[[str], List[str]]) -> Dict[str, List[str]]:
    """
    Get the default pattern from a directory or repository by testing all the supported patterns.
    The first pattern that returns a non-empty list of data files is returned.

    In order, it first tests if SPLIT_PATTERN_SHARDED works, otherwise it tests the patterns in ALL_DEFAULT_PATTERNS.
    """
    # first check the split patterns like data/{split}-00000-of-00001.parquet
    for split_pattern in ALL_SPLIT_PATTERNS:
        pattern = split_pattern.replace("{split}", "*")
        try:
            data_files = pattern_resolver(pattern)
        except FileNotFoundError:
            continue
        if len(data_files) > 0:
            splits: Set[str] = {
                string_to_dict(xbasename(p), glob_pattern_to_regex(xbasename(split_pattern)))["split"]
                for p in data_files
            }
            if any(not re.match(_split_re, split) for split in splits):
                raise ValueError(f"Split name should match '{_split_re}' but got '{splits}'.")
            sorted_splits = [str(split) for split in DEFAULT_SPLITS if split in splits] + sorted(
                splits - set(DEFAULT_SPLITS)
            )
            return {split: [split_pattern.format(split=split)] for split in sorted_splits}
    # then check the default patterns based on train/valid/test splits
    for patterns_dict in ALL_DEFAULT_PATTERNS:
        non_empty_splits = []
        for split, patterns in patterns_dict.items():
            for pattern in patterns:
                try:
                    data_files = pattern_resolver(pattern)
                except FileNotFoundError:
                    continue
                if len(data_files) > 0:
                    non_empty_splits.append(split)
                    break
        if non_empty_splits:
            return {split: patterns_dict[split] for split in non_empty_splits}
    raise FileNotFoundError(f"Couldn't resolve pattern {pattern} with resolver {pattern_resolver}")

def _get_metadata_files_patterns(pattern_resolver: Callable[[str], List[str]]) -> List[str]:
    """
    Get the supported metadata patterns from a directory or repository.
    """
    non_empty_patterns = []
    for pattern in METADATA_PATTERNS:
        try:
            metadata_files = pattern_resolver(pattern)
            if len(metadata_files) > 0:
                non_empty_patterns.append(pattern)
        except FileNotFoundError:
            pass
    if non_empty_patterns:
        return non_empty_patterns
    raise FileNotFoundError(f"Couldn't resolve pattern {pattern} with resolver {pattern_resolver}")

def resolve_pattern(
    pattern: str,
    base_path: str,
    allowed_extensions: Optional[List[str]] = None,
    download_config: Optional[DownloadConfig] = None,
) -> List[str]:
    """
    Resolve the paths and URLs of the data files from the pattern passed by the user.

    You can use patterns to resolve multiple local files. Here are a few examples:
    - *.csv to match all the CSV files at the first level
    - **.csv to match all the CSV files at any level
    - data/* to match all the files inside "data"
    - data/** to match all the files inside "data" and its subdirectories

    The patterns are resolved using the fsspec glob. In fsspec>=2023.12.0 this is equivalent to
    Python's glob.glob, Path.glob, Path.match and fnmatch where ** is unsupported with a prefix/suffix
    other than a forward slash /.

    More generally:
    - '*' matches any character except a forward-slash (to match just the file or directory name)
    - '**' matches any character including a forward-slash /

    Hidden files and directories (i.e. whose names start with a dot) are ignored, unless they are explicitly requested.
    The same applies to special directories that start with a double underscore like "__pycache__".
    You can still include one if the pattern explicitly mentions it:
    - to include a hidden file: "*/.hidden.txt" or "*/.*"
    - to include a hidden directory: ".hidden/*" or ".*/*"
    - to include a special directory: "__special__/*" or "__*/*"

    Example::

        >>> from datasets.data_files import resolve_pattern
        >>> base_path = "."
        >>> resolve_pattern("docs/**/*.py", base_path)
        ['/Users/mariosasko/Desktop/projects/datasets/docs/source/_config.py']

    Args:
        pattern (str): Unix pattern or paths or URLs of the data files to resolve.
            The paths can be absolute or relative to base_path.
            Remote filesystems using fsspec are supported, e.g. with the hf:// protocol.
        base_path (str): Base path to use when resolving relative paths.
        allowed_extensions (Optional[list], optional): White-list of file extensions to use. Defaults to None (all extensions).
            For example: allowed_extensions=[".csv", ".json", ".txt", ".parquet"]
        download_config ([`DownloadConfig`], *optional*): Specific download configuration parameters.
    Returns:
        List[str]: List of paths or URLs to the local or remote files that match the patterns.
    """
    if is_relative_path(pattern):
        pattern = xjoin(base_path, pattern)
    elif is_local_path(pattern):
        base_path = os.path.splitdrive(pattern)[0] + os.sep
    else:
        base_path = ""
    pattern, storage_options = _prepare_path_and_storage_options(pattern, download_config=download_config)
    fs, fs_pattern = url_to_fs(pattern, **storage_options)
    files_to_ignore = set(FILES_TO_IGNORE) - {xbasename(pattern)}
    protocol = fs.protocol if isinstance(fs.protocol, str) else fs.protocol[0]
    protocol_prefix = protocol + "://" if protocol != "file" else ""
    glob_kwargs = {}
    if protocol == "hf" and config.HF_HUB_VERSION >= version.parse("0.20.0"):
        # 10 times faster glob with detail=True (ignores costly info like lastCommit)
        glob_kwargs["expand_info"] = False
    matched_paths = [
        filepath if filepath.startswith(protocol_prefix) else protocol_prefix + filepath
        for filepath, info in fs.glob(pattern, detail=True, **glob_kwargs).items()
        if info["type"] == "file"
        and (xbasename(filepath) not in files_to_ignore)
        and not _is_inside_unrequested_special_dir(filepath, fs_pattern)
        and not _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(filepath, fs_pattern)
    ]  # ignore .ipynb and __pycache__, but keep /../
    if allowed_extensions is not None:
        out = [
            filepath
            for filepath in matched_paths
            if any("." + suffix in allowed_extensions for suffix in xbasename(filepath).split(".")[1:])
        ]
        if len(out) < len(matched_paths):
            invalid_matched_files = list(set(matched_paths) - set(out))
            logger.info(
                f"Some files matched the pattern '{pattern}' but don't have valid data file extensions: {invalid_matched_files}"
            )
    else:
        out = matched_paths
    if not out:
        error_msg = f"Unable to find '{pattern}'"
        if allowed_extensions is not None:
            error_msg += f" with any supported extension {list(allowed_extensions)}"
        raise FileNotFoundError(error_msg)
    return out

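# Usage sketch (illustrative): resolving a glob against a local directory while
# keeping only CSV files. The directory layout and resulting paths are hypothetical.
#
# >>> resolve_pattern("data/**", base_path=".", allowed_extensions=[".csv"])
# ['/abs/path/data/train.csv', '/abs/path/data/test.csv']
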
def get_data_patterns(base_path: str, download_config: Optional[DownloadConfig] = None) -> Dict[str, List[str]]:
    """
    Get the default pattern from a directory by testing all the supported patterns.
    The first pattern that returns a non-empty list of data files is returned.

    Some examples of supported patterns:

    Input:

        my_dataset_repository/
        ├── README.md
        └── dataset.csv

    Output:

        {'train': ['**']}

    Input:

        my_dataset_repository/
        ├── README.md
        ├── train.csv
        └── test.csv

        my_dataset_repository/
        ├── README.md
        └── data/
            ├── train.csv
            └── test.csv

        my_dataset_repository/
        ├── README.md
        ├── train_0.csv
        ├── train_1.csv
        ├── train_2.csv
        ├── train_3.csv
        ├── test_0.csv
        └── test_1.csv

    Output:

        {'train': ['**/train[-._ 0-9]*', '**/*[-._ 0-9]train[-._ 0-9]*', '**/training[-._ 0-9]*', '**/*[-._ 0-9]training[-._ 0-9]*'],
         'test': ['**/test[-._ 0-9]*', '**/*[-._ 0-9]test[-._ 0-9]*', '**/testing[-._ 0-9]*', '**/*[-._ 0-9]testing[-._ 0-9]*', ...]}

    Input:

        my_dataset_repository/
        ├── README.md
        └── data/
            ├── train/
            │   ├── shard_0.csv
            │   ├── shard_1.csv
            │   ├── shard_2.csv
            │   └── shard_3.csv
            └── test/
                ├── shard_0.csv
                └── shard_1.csv

    Output:

        {'train': ['**/train/**', '**/train[-._ 0-9]*/**', '**/*[-._ 0-9]train/**', '**/*[-._ 0-9]train[-._ 0-9]*/**', ...],
         'test': ['**/test/**', '**/test[-._ 0-9]*/**', '**/*[-._ 0-9]test/**', '**/*[-._ 0-9]test[-._ 0-9]*/**', ...]}

    Input:

        my_dataset_repository/
        ├── README.md
        └── data/
            ├── train-00000-of-00003.csv
            ├── train-00001-of-00003.csv
            ├── train-00002-of-00003.csv
            ├── test-00000-of-00001.csv
            ├── random-00000-of-00003.csv
            ├── random-00001-of-00003.csv
            └── random-00002-of-00003.csv

    Output:

        {'train': ['data/train-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9]*.*'],
         'test': ['data/test-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9]*.*'],
         'random': ['data/random-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9]*.*']}

    In order, it first tests if SPLIT_PATTERN_SHARDED works, otherwise it tests the patterns in ALL_DEFAULT_PATTERNS.
    """
    resolver = partial(resolve_pattern, base_path=base_path, download_config=download_config)
    try:
        return _get_data_files_patterns(resolver)
    except FileNotFoundError:
        raise EmptyDatasetError(f"The directory at {base_path} doesn't contain any data files") from None

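# Usage sketch (illustrative, hypothetical local layout): given a directory with
# "train.csv" and "test.csv" at its root, the default patterns are inferred as:
#
# >>> get_data_patterns("path/to/my_dataset_repository")
# {'train': ['**/train[-._ 0-9]*', ...], 'test': ['**/test[-._ 0-9]*', ...]}
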
def get_metadata_patterns(
    base_path: str,
    download_config: Optional[DownloadConfig] = None,
) -> List[str]:
    """
    Get the supported metadata patterns from a local directory.
    """
    resolver = partial(resolve_pattern, base_path=base_path, download_config=download_config)
    try:
        return _get_metadata_files_patterns(resolver)
    except FileNotFoundError:
        raise FileNotFoundError(f"The directory at {base_path} doesn't contain any metadata file") from None

def _get_single_origin_metadata(
    data_file: str,
    download_config: Optional[DownloadConfig] = None,
) -> SingleOriginMetadata:
    data_file, storage_options = _prepare_path_and_storage_options(data_file, download_config=download_config)
    fs, *_ = url_to_fs(data_file, **storage_options)
    if isinstance(fs, HfFileSystem):
        resolved_path = fs.resolve_path(data_file)
        return resolved_path.repo_id, resolved_path.revision
    elif isinstance(fs, HTTPFileSystem) and data_file.startswith(config.HF_ENDPOINT):
        hffs = HfFileSystem(endpoint=config.HF_ENDPOINT, token=download_config.token)
        data_file = "hf://" + data_file[len(config.HF_ENDPOINT) + 1 :].replace("/resolve/", "@", 1)
        resolved_path = hffs.resolve_path(data_file)
        return resolved_path.repo_id, resolved_path.revision
    info = fs.info(data_file)
    # s3fs uses "ETag", gcsfs uses "etag", and for local we simply check mtime
    for key in ["ETag", "etag", "mtime"]:
        if key in info:
            return (str(info[key]),)
    return ()

def _get_origin_metadata(
    data_files: List[str],
    download_config: Optional[DownloadConfig] = None,
    max_workers: Optional[int] = None,
) -> List[SingleOriginMetadata]:
    max_workers = max_workers if max_workers is not None else config.HF_DATASETS_MULTITHREADING_MAX_WORKERS
    return thread_map(
        partial(_get_single_origin_metadata, download_config=download_config),
        data_files,
        max_workers=max_workers,
        tqdm_class=hf_tqdm,
        desc="Resolving data files",
        # set `disable=None` rather than `disable=False` by default to disable progress bar when no TTY attached
        disable=len(data_files) <= 16 or None,
    )

class DataFilesList(List[str]):
    """
    List of data files (absolute local paths or URLs).
    It has two construction methods given the user's data files patterns:
    - ``from_hf_repo``: resolve patterns inside a dataset repository
    - ``from_local_or_remote``: resolve patterns from a local path

    Moreover, DataFilesList has an additional attribute ``origin_metadata``.
    It can store:
    - the last modified time of local files
    - ETag of remote files
    - commit sha of a dataset repository

    Thanks to this additional attribute, it is possible to hash the list
    and get a different hash if and only if at least one file changed.
    This is useful for caching Dataset objects that are obtained from a list of data files.
    """

    def __init__(self, data_files: List[str], origin_metadata: List[SingleOriginMetadata]) -> None:
        super().__init__(data_files)
        self.origin_metadata = origin_metadata

    def __add__(self, other: "DataFilesList") -> "DataFilesList":
        return DataFilesList([*self, *other], self.origin_metadata + other.origin_metadata)

    @classmethod
    def from_hf_repo(
        cls,
        patterns: List[str],
        dataset_info: huggingface_hub.hf_api.DatasetInfo,
        base_path: Optional[str] = None,
        allowed_extensions: Optional[List[str]] = None,
        download_config: Optional[DownloadConfig] = None,
    ) -> "DataFilesList":
        base_path = f"hf://datasets/{dataset_info.id}@{dataset_info.sha}/{base_path or ''}".rstrip("/")
        return cls.from_patterns(
            patterns, base_path=base_path, allowed_extensions=allowed_extensions, download_config=download_config
        )

    @classmethod
    def from_local_or_remote(
        cls,
        patterns: List[str],
        base_path: Optional[str] = None,
        allowed_extensions: Optional[List[str]] = None,
        download_config: Optional[DownloadConfig] = None,
    ) -> "DataFilesList":
        base_path = base_path if base_path is not None else Path().resolve().as_posix()
        return cls.from_patterns(
            patterns, base_path=base_path, allowed_extensions=allowed_extensions, download_config=download_config
        )

    @classmethod
    def from_patterns(
        cls,
        patterns: List[str],
        base_path: Optional[str] = None,
        allowed_extensions: Optional[List[str]] = None,
        download_config: Optional[DownloadConfig] = None,
    ) -> "DataFilesList":
        base_path = base_path if base_path is not None else Path().resolve().as_posix()
        data_files = []
        for pattern in patterns:
            try:
                data_files.extend(
                    resolve_pattern(
                        pattern,
                        base_path=base_path,
                        allowed_extensions=allowed_extensions,
                        download_config=download_config,
                    )
                )
            except FileNotFoundError:
                if not has_magic(pattern):
                    raise
        origin_metadata = _get_origin_metadata(data_files, download_config=download_config)
        return cls(data_files, origin_metadata)

    def filter_extensions(self, extensions: List[str]) -> "DataFilesList":
        pattern = "|".join("\\" + ext for ext in extensions)
        pattern = re.compile(f".*({pattern})(\\..+)?$")
        return DataFilesList(
            [data_file for data_file in self if pattern.match(data_file)],
            origin_metadata=self.origin_metadata,
        )

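# Usage sketch (illustrative): building a DataFilesList from glob patterns.
# The pattern and directory below are hypothetical.
#
# >>> data_files = DataFilesList.from_patterns(["data/*.parquet"], base_path="/path/to/repo")
# >>> data_files[:2]
# ['/path/to/repo/data/train-00000.parquet', '/path/to/repo/data/train-00001.parquet']
# >>> data_files.origin_metadata  # e.g. mtimes locally, ETags or commit shas remotely
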
class DataFilesDict(Dict[str, DataFilesList]):
    """
    Dict of split_name -> list of data files (absolute local paths or URLs).
    It has two construction methods given the user's data files patterns:
    - ``from_hf_repo``: resolve patterns inside a dataset repository
    - ``from_local_or_remote``: resolve patterns from a local path

    Moreover, each list is a DataFilesList. It is possible to hash the dictionary
    and get a different hash if and only if at least one file changed.
    For more info, see [`DataFilesList`].

    This is useful for caching Dataset objects that are obtained from a list of data files.

    Changing the order of the keys of this dictionary also doesn't change its hash.
    """

    @classmethod
    def from_local_or_remote(
        cls,
        patterns: Dict[str, Union[List[str], DataFilesList]],
        base_path: Optional[str] = None,
        allowed_extensions: Optional[List[str]] = None,
        download_config: Optional[DownloadConfig] = None,
    ) -> "DataFilesDict":
        out = cls()
        for key, patterns_for_key in patterns.items():
            out[key] = (
                patterns_for_key
                if isinstance(patterns_for_key, DataFilesList)
                else DataFilesList.from_local_or_remote(
                    patterns_for_key,
                    base_path=base_path,
                    allowed_extensions=allowed_extensions,
                    download_config=download_config,
                )
            )
        return out

    @classmethod
    def from_hf_repo(
        cls,
        patterns: Dict[str, Union[List[str], DataFilesList]],
        dataset_info: huggingface_hub.hf_api.DatasetInfo,
        base_path: Optional[str] = None,
        allowed_extensions: Optional[List[str]] = None,
        download_config: Optional[DownloadConfig] = None,
    ) -> "DataFilesDict":
        out = cls()
        for key, patterns_for_key in patterns.items():
            out[key] = (
                patterns_for_key
                if isinstance(patterns_for_key, DataFilesList)
                else DataFilesList.from_hf_repo(
                    patterns_for_key,
                    dataset_info=dataset_info,
                    base_path=base_path,
                    allowed_extensions=allowed_extensions,
                    download_config=download_config,
                )
            )
        return out

    @classmethod
    def from_patterns(
        cls,
        patterns: Dict[str, Union[List[str], DataFilesList]],
        base_path: Optional[str] = None,
        allowed_extensions: Optional[List[str]] = None,
        download_config: Optional[DownloadConfig] = None,
    ) -> "DataFilesDict":
        out = cls()
        for key, patterns_for_key in patterns.items():
            out[key] = (
                patterns_for_key
                if isinstance(patterns_for_key, DataFilesList)
                else DataFilesList.from_patterns(
                    patterns_for_key,
                    base_path=base_path,
                    allowed_extensions=allowed_extensions,
                    download_config=download_config,
                )
            )
        return out

    def filter_extensions(self, extensions: List[str]) -> "DataFilesDict":
        out = type(self)()
        for key, data_files_list in self.items():
            out[key] = data_files_list.filter_extensions(extensions)
        return out

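# Usage sketch (illustrative): combining sanitize_patterns and DataFilesDict,
# with hypothetical file names.
#
# >>> patterns = sanitize_patterns({"train": "data/train-*.csv", "test": "data/test-*.csv"})
# >>> data_files = DataFilesDict.from_patterns(patterns, base_path="/path/to/repo")
# >>> sorted(data_files)
# ['test', 'train']
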
class DataFilesPatternsList(List[str]):
    """
    List of data files patterns (absolute local paths or URLs).
    For each pattern there should also be a list of allowed extensions
    to keep, or None to keep all the files matching the pattern.
    """

    def __init__(
        self,
        patterns: List[str],
        allowed_extensions: List[Optional[List[str]]],
    ):
        super().__init__(patterns)
        self.allowed_extensions = allowed_extensions

    def __add__(self, other):
        return DataFilesPatternsList([*self, *other], self.allowed_extensions + other.allowed_extensions)

    @classmethod
    def from_patterns(
        cls, patterns: List[str], allowed_extensions: Optional[List[str]] = None
    ) -> "DataFilesPatternsList":
        return cls(patterns, [allowed_extensions] * len(patterns))

    def resolve(
        self,
        base_path: str,
        download_config: Optional[DownloadConfig] = None,
    ) -> "DataFilesList":
        base_path = base_path if base_path is not None else Path().resolve().as_posix()
        data_files = []
        for pattern, allowed_extensions in zip(self, self.allowed_extensions):
            try:
                data_files.extend(
                    resolve_pattern(
                        pattern,
                        base_path=base_path,
                        allowed_extensions=allowed_extensions,
                        download_config=download_config,
                    )
                )
            except FileNotFoundError:
                if not has_magic(pattern):
                    raise
        origin_metadata = _get_origin_metadata(data_files, download_config=download_config)
        return DataFilesList(data_files, origin_metadata)

    def filter_extensions(self, extensions: List[str]) -> "DataFilesPatternsList":
        return DataFilesPatternsList(
            self, [allowed_extensions + extensions for allowed_extensions in self.allowed_extensions]
        )

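# Usage sketch (illustrative): patterns lists are lazy; resolution happens later.
#
# >>> patterns = DataFilesPatternsList.from_patterns(["data/*"], allowed_extensions=[".csv"])
# >>> data_files = patterns.resolve(base_path="/path/to/repo")  # -> DataFilesList
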
class DataFilesPatternsDict(Dict[str, DataFilesPatternsList]):
    """
    Dict of split_name -> list of data files patterns (absolute local paths or URLs).
    """

    @classmethod
    def from_patterns(
        cls, patterns: Dict[str, List[str]], allowed_extensions: Optional[List[str]] = None
    ) -> "DataFilesPatternsDict":
        out = cls()
        for key, patterns_for_key in patterns.items():
            out[key] = (
                patterns_for_key
                if isinstance(patterns_for_key, DataFilesPatternsList)
                else DataFilesPatternsList.from_patterns(
                    patterns_for_key,
                    allowed_extensions=allowed_extensions,
                )
            )
        return out

    def resolve(
        self,
        base_path: str,
        download_config: Optional[DownloadConfig] = None,
    ) -> "DataFilesDict":
        out = DataFilesDict()
        for key, data_files_patterns_list in self.items():
            out[key] = data_files_patterns_list.resolve(base_path, download_config)
        return out

    def filter_extensions(self, extensions: List[str]) -> "DataFilesPatternsDict":
        out = type(self)()
        for key, data_files_patterns_list in self.items():
            out[key] = data_files_patterns_list.filter_extensions(extensions)
        return out
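# Usage sketch (illustrative): the dict variant resolves every split at once.
#
# >>> patterns_dict = DataFilesPatternsDict.from_patterns({"train": ["data/train-*"]})
# >>> patterns_dict.resolve(base_path="/path/to/repo")  # -> DataFilesDict with a "train" key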
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/datasets/fingerprint.py
ADDED
@@ -0,0 +1,494 @@
import inspect
import os
import random
import shutil
import tempfile
import weakref
from functools import wraps
from pathlib import Path
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union

import numpy as np
import xxhash

from . import config
from .naming import INVALID_WINDOWS_CHARACTERS_IN_PATH
from .utils._dill import dumps
from .utils.deprecation_utils import deprecated
from .utils.logging import get_logger


if TYPE_CHECKING:
    from .arrow_dataset import Dataset


logger = get_logger(__name__)


# Fingerprinting makes it possible to have one deterministic fingerprint per dataset state.
# A dataset fingerprint is updated after each transform.
# Re-running the same transforms on a dataset in a different session results in the same fingerprint.
# This is possible thanks to a custom hashing function that works with most python objects.

# Fingerprinting is the main mechanism that enables caching.
# The caching mechanism makes it possible to reload an existing cache file if it has already been computed.


#################
# Caching
#################

_CACHING_ENABLED = True
_TEMP_DIR_FOR_TEMP_CACHE_FILES: Optional["_TempCacheDir"] = None
_DATASETS_WITH_TABLE_IN_TEMP_DIR: Optional[weakref.WeakSet] = None

class _TempCacheDir:
    """
    A temporary directory for storing cached Arrow files with a cleanup that frees references to the Arrow files
    before deleting the directory itself, to avoid permission errors on Windows.
    """

    def __init__(self):
        self.name = tempfile.mkdtemp(prefix=config.TEMP_CACHE_DIR_PREFIX)
        self._finalizer = weakref.finalize(self, self._cleanup)

    def _cleanup(self):
        for dset in get_datasets_with_cache_file_in_temp_dir():
            dset.__del__()
        if os.path.exists(self.name):
            try:
                shutil.rmtree(self.name)
            except Exception as e:
                raise OSError(
                    f"An error occurred while trying to delete temporary cache directory {self.name}. Please delete it manually."
                ) from e

    def cleanup(self):
        if self._finalizer.detach():
            self._cleanup()

def maybe_register_dataset_for_temp_dir_deletion(dataset):
    """
    This function registers the datasets that have cache files in _TEMP_DIR_FOR_TEMP_CACHE_FILES in order
    to properly delete them before deleting the temporary directory.
    The temporary directory _TEMP_DIR_FOR_TEMP_CACHE_FILES is used when caching is disabled.
    """
    if _TEMP_DIR_FOR_TEMP_CACHE_FILES is None:
        return

    global _DATASETS_WITH_TABLE_IN_TEMP_DIR
    if _DATASETS_WITH_TABLE_IN_TEMP_DIR is None:
        _DATASETS_WITH_TABLE_IN_TEMP_DIR = weakref.WeakSet()
    if any(
        Path(_TEMP_DIR_FOR_TEMP_CACHE_FILES.name) in Path(cache_file["filename"]).parents
        for cache_file in dataset.cache_files
    ):
        _DATASETS_WITH_TABLE_IN_TEMP_DIR.add(dataset)


def get_datasets_with_cache_file_in_temp_dir():
    return list(_DATASETS_WITH_TABLE_IN_TEMP_DIR) if _DATASETS_WITH_TABLE_IN_TEMP_DIR is not None else []

def enable_caching():
    """
    When applying transforms on a dataset, the data are stored in cache files.
    The caching mechanism makes it possible to reload an existing cache file if it has already been computed.

    Reloading a dataset is possible since the cache files are named using the dataset fingerprint, which is updated
    after each transform.

    If disabled, the library will no longer reload cached datasets files when applying transforms to the datasets.
    More precisely, if the caching is disabled:
    - cache files are always recreated
    - cache files are written to a temporary directory that is deleted when the session closes
    - cache files are named using a random hash instead of the dataset fingerprint
    - use [`~datasets.Dataset.save_to_disk`] to save a transformed dataset, or it will be deleted when the session closes
    - caching doesn't affect [`~datasets.load_dataset`]. If you want to regenerate a dataset from scratch you should use
      the `download_mode` parameter in [`~datasets.load_dataset`].
    """
    global _CACHING_ENABLED
    _CACHING_ENABLED = True


def disable_caching():
    """
    When applying transforms on a dataset, the data are stored in cache files.
    The caching mechanism makes it possible to reload an existing cache file if it has already been computed.

    Reloading a dataset is possible since the cache files are named using the dataset fingerprint, which is updated
    after each transform.

    If disabled, the library will no longer reload cached datasets files when applying transforms to the datasets.
    More precisely, if the caching is disabled:
    - cache files are always recreated
    - cache files are written to a temporary directory that is deleted when the session closes
    - cache files are named using a random hash instead of the dataset fingerprint
    - use [`~datasets.Dataset.save_to_disk`] to save a transformed dataset, or it will be deleted when the session closes
    - caching doesn't affect [`~datasets.load_dataset`]. If you want to regenerate a dataset from scratch you should use
      the `download_mode` parameter in [`~datasets.load_dataset`].
    """
    global _CACHING_ENABLED
    _CACHING_ENABLED = False

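# Usage sketch (illustrative): toggling the cache globally for a session.
#
# >>> import datasets
# >>> datasets.disable_caching()
# >>> datasets.is_caching_enabled()
# False
# >>> datasets.enable_caching()
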
@deprecated(
    "Use datasets.enable_caching() or datasets.disable_caching() instead. This function will be removed in a future version of datasets."
)
def set_caching_enabled(boolean: bool):
    """
    When applying transforms on a dataset, the data are stored in cache files.
    The caching mechanism makes it possible to reload an existing cache file if it has already been computed.

    Reloading a dataset is possible since the cache files are named using the dataset fingerprint, which is updated
    after each transform.

    If disabled, the library will no longer reload cached datasets files when applying transforms to the datasets.
    More precisely, if the caching is disabled:
    - cache files are always recreated
    - cache files are written to a temporary directory that is deleted when the session closes
    - cache files are named using a random hash instead of the dataset fingerprint
    - use :func:`datasets.Dataset.save_to_disk` to save a transformed dataset, or it will be deleted when the session closes
    - caching doesn't affect :func:`datasets.load_dataset`. If you want to regenerate a dataset from scratch you should use
      the ``download_mode`` parameter in :func:`datasets.load_dataset`.
    """
    global _CACHING_ENABLED
    _CACHING_ENABLED = bool(boolean)

def is_caching_enabled() -> bool:
    """
    When applying transforms on a dataset, the data are stored in cache files.
    The caching mechanism makes it possible to reload an existing cache file if it has already been computed.

    Reloading a dataset is possible since the cache files are named using the dataset fingerprint, which is updated
    after each transform.

    If disabled, the library will no longer reload cached datasets files when applying transforms to the datasets.
    More precisely, if the caching is disabled:
    - cache files are always recreated
    - cache files are written to a temporary directory that is deleted when the session closes
    - cache files are named using a random hash instead of the dataset fingerprint
    - use [`~datasets.Dataset.save_to_disk`] to save a transformed dataset, or it will be deleted when the session closes
    - caching doesn't affect [`~datasets.load_dataset`]. If you want to regenerate a dataset from scratch you should use
      the `download_mode` parameter in [`~datasets.load_dataset`].
    """
    global _CACHING_ENABLED
    return bool(_CACHING_ENABLED)

def get_temporary_cache_files_directory() -> str:
    """Return a directory that is deleted when the session closes."""
    global _TEMP_DIR_FOR_TEMP_CACHE_FILES
    if _TEMP_DIR_FOR_TEMP_CACHE_FILES is None:
        _TEMP_DIR_FOR_TEMP_CACHE_FILES = _TempCacheDir()
    return _TEMP_DIR_FOR_TEMP_CACHE_FILES.name

#################
# Hashing
#################


@deprecated("Use `copyreg.pickle` to register a custom reducer.")
def hashregister(*types):
    def proxy(func):
        for t in types:
            Hasher.dispatch[t] = func
        return func

    return proxy

class Hasher:
    """Hasher that accepts python objects as inputs."""

    dispatch: Dict = {}

    def __init__(self):
        self.m = xxhash.xxh64()

    @classmethod
    def hash_bytes(cls, value: Union[bytes, List[bytes]]) -> str:
        value = [value] if isinstance(value, bytes) else value
        m = xxhash.xxh64()
        for x in value:
            m.update(x)
        return m.hexdigest()

    @classmethod
    @deprecated("Use `Hasher.hash` instead.")
    def hash_default(cls, value: Any) -> str:
        return cls.hash(value)

    @classmethod
    def hash(cls, value: Any) -> str:
        return cls.hash_bytes(dumps(value))

    def update(self, value: Any) -> None:
        header_for_update = f"=={type(value)}=="
        value_for_update = self.hash(value)
        self.m.update(header_for_update.encode("utf8"))
        self.m.update(value_for_update.encode("utf-8"))

    def hexdigest(self) -> str:
        return self.m.hexdigest()

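# Usage sketch (illustrative): Hasher produces a deterministic digest for most
# picklable Python objects, so equal inputs hash equally across sessions.
#
# >>> Hasher.hash({"split": "train", "shards": 4}) == Hasher.hash({"split": "train", "shards": 4})
# True
# >>> h = Hasher()
# >>> h.update("some_transform"); h.update((1, 2, 3))
# >>> digest = h.hexdigest()  # 16 hex chars from xxh64
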
#################
# Fingerprinting
#################

fingerprint_rng = random.Random()
# we show a warning only once when fingerprinting fails to avoid spam
fingerprint_warnings: Dict[str, bool] = {}


def generate_fingerprint(dataset: "Dataset") -> str:
    state = dataset.__dict__
    hasher = Hasher()
    for key in sorted(state):
        if key == "_fingerprint":
            continue
        hasher.update(key)
        hasher.update(state[key])
    # hash data files last modification timestamps as well
    for cache_file in dataset.cache_files:
        hasher.update(os.path.getmtime(cache_file["filename"]))
    return hasher.hexdigest()


def generate_random_fingerprint(nbits: int = 64) -> str:
    return f"{fingerprint_rng.getrandbits(nbits):0{nbits//4}x}"

def update_fingerprint(fingerprint, transform, transform_args):
    global fingerprint_warnings
    hasher = Hasher()
    hasher.update(fingerprint)
    try:
        hasher.update(transform)
    except:  # noqa various errors might raise here from pickle or dill
        if _CACHING_ENABLED:
            if not fingerprint_warnings.get("update_fingerprint_transform_hash_failed", False):
                logger.warning(
                    f"Transform {transform} couldn't be hashed properly, a random hash was used instead. "
                    "Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. "
                    "If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. "
                    "This warning is only shown once. Subsequent hashing failures won't be shown."
                )
                fingerprint_warnings["update_fingerprint_transform_hash_failed"] = True
            else:
                logger.info(f"Transform {transform} couldn't be hashed properly, a random hash was used instead.")
        else:
            logger.info(
                f"Transform {transform} couldn't be hashed properly, a random hash was used instead. This doesn't affect caching since it's disabled."
            )
        return generate_random_fingerprint()
    for key in sorted(transform_args):
        hasher.update(key)
        try:
            hasher.update(transform_args[key])
        except:  # noqa various errors might raise here from pickle or dill
            if _CACHING_ENABLED:
                if not fingerprint_warnings.get("update_fingerprint_transform_hash_failed", False):
                    logger.warning(
                        f"Parameter '{key}'={transform_args[key]} of the transform {transform} couldn't be hashed properly, a random hash was used instead. "
                        "Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. "
                        "If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. "
                        "This warning is only shown once. Subsequent hashing failures won't be shown."
                    )
                    fingerprint_warnings["update_fingerprint_transform_hash_failed"] = True
                else:
                    logger.info(
                        f"Parameter '{key}'={transform_args[key]} of the transform {transform} couldn't be hashed properly, a random hash was used instead."
                    )
            else:
                logger.info(
                    f"Parameter '{key}'={transform_args[key]} of the transform {transform} couldn't be hashed properly, a random hash was used instead. This doesn't affect caching since it's disabled."
                )
            return generate_random_fingerprint()
    return hasher.hexdigest()

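# Usage sketch (illustrative): each transform chains onto the previous fingerprint,
# so the same pipeline replayed in a new session yields the same final fingerprint.
# The initial fingerprint below is hypothetical.
#
# >>> fp0 = "a1b2c3d4e5f60718"
# >>> fp1 = update_fingerprint(fp0, "Dataset.map@1.0.0", {"batched": True, "batch_size": 1000})
# >>> update_fingerprint(fp0, "Dataset.map@1.0.0", {"batched": True, "batch_size": 1000}) == fp1
# True
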
def validate_fingerprint(fingerprint: str, max_length=64):
    """
    Make sure the fingerprint is a non-empty string that is not longer than max_length (64 by default),
    so that the fingerprint can be used to name cache files without issues.
    """
    if not isinstance(fingerprint, str) or not fingerprint:
        raise ValueError(f"Invalid fingerprint '{fingerprint}': it should be a non-empty string.")
    for invalid_char in INVALID_WINDOWS_CHARACTERS_IN_PATH:
        if invalid_char in fingerprint:
            raise ValueError(
                f"Invalid fingerprint. Bad characters from black list '{INVALID_WINDOWS_CHARACTERS_IN_PATH}' found in '{fingerprint}'. "
                f"They could create issues when creating cache files."
            )
    if len(fingerprint) > max_length:
        raise ValueError(
            f"Invalid fingerprint. Maximum length is {max_length} but '{fingerprint}' has length {len(fingerprint)}. "
            "It could create issues when creating cache files."
        )

def format_transform_for_fingerprint(func: Callable, version: Optional[str] = None) -> str:
    """
    Format a transform to the format that will be used to update the fingerprint.
    """
    transform = f"{func.__module__}.{func.__qualname__}"
    if version is not None:
        transform += f"@{version}"
    return transform

def format_kwargs_for_fingerprint(
    func: Callable,
    args: Tuple,
    kwargs: Dict[str, Any],
    use_kwargs: Optional[List[str]] = None,
    ignore_kwargs: Optional[List[str]] = None,
    randomized_function: bool = False,
) -> Dict[str, Any]:
    """
    Format the kwargs of a transform to the format that will be used to update the fingerprint.
    """
    kwargs_for_fingerprint = kwargs.copy()
    if args:
        params = [p.name for p in inspect.signature(func).parameters.values() if p != p.VAR_KEYWORD]
        args = args[1:]  # assume the first argument is the dataset
        params = params[1:]
        kwargs_for_fingerprint.update(zip(params, args))
    else:
        del kwargs_for_fingerprint[
            next(iter(inspect.signature(func).parameters))
        ]  # assume the first key is the dataset

    # keep the right kwargs to be hashed to generate the fingerprint

    if use_kwargs:
        kwargs_for_fingerprint = {k: v for k, v in kwargs_for_fingerprint.items() if k in use_kwargs}
    if ignore_kwargs:
        kwargs_for_fingerprint = {k: v for k, v in kwargs_for_fingerprint.items() if k not in ignore_kwargs}
    if randomized_function:  # randomized functions have `seed` and `generator` parameters
        if kwargs_for_fingerprint.get("seed") is None and kwargs_for_fingerprint.get("generator") is None:
            _, seed, pos, *_ = np.random.get_state()
            seed = seed[pos] if pos < 624 else seed[0]
            kwargs_for_fingerprint["generator"] = np.random.default_rng(seed)

    # remove kwargs that are the default values

    default_values = {
        p.name: p.default for p in inspect.signature(func).parameters.values() if p.default != inspect._empty
    }
    for default_varname, default_value in default_values.items():
        if default_varname in kwargs_for_fingerprint and kwargs_for_fingerprint[default_varname] == default_value:
            kwargs_for_fingerprint.pop(default_varname)
    return kwargs_for_fingerprint

def fingerprint_transform(
    inplace: bool,
    use_kwargs: Optional[List[str]] = None,
    ignore_kwargs: Optional[List[str]] = None,
    fingerprint_names: Optional[List[str]] = None,
    randomized_function: bool = False,
    version: Optional[str] = None,
):
    """
    Wrapper for dataset transforms to update the dataset fingerprint using ``update_fingerprint``.

    Args:
        inplace (:obj:`bool`): If inplace is True, the fingerprint of the dataset is updated inplace.
            Otherwise, a parameter "new_fingerprint" is passed to the wrapped method that should take care of
            setting the fingerprint of the returned Dataset.
        use_kwargs (:obj:`List[str]`, optional): optional white list of argument names to take into account
            when updating the fingerprint. By default all the arguments are used.
        ignore_kwargs (:obj:`List[str]`, optional): optional black list of argument names to ignore
            when updating the fingerprint. Note that ignore_kwargs takes precedence over use_kwargs.
        fingerprint_names (:obj:`List[str]`, optional, defaults to ["new_fingerprint"]):
            If the dataset transform is not inplace and returns a DatasetDict, then it can require
            several fingerprints (one per dataset in the DatasetDict). By specifying fingerprint_names,
            one fingerprint named after each element of fingerprint_names is going to be passed.
        randomized_function (:obj:`bool`, defaults to False): If the dataset transform is random and has
            optional parameters "seed" and "generator", then you can set randomized_function to True.
            This way, even if users set "seed" and "generator" to None, the fingerprint is
            going to be randomly generated depending on numpy's current state. In this case, the
            generator is set to np.random.default_rng(np.random.get_state()[1][0]).
        version (:obj:`str`, optional): version of the transform. The version is taken into account when
            computing the fingerprint. If a dataset transform changes (or at least if the output data
            that are cached change), then one should increase the version. If the version stays the
            same, then old cached data that are not compatible with the new transform could be reused.
            It should be in the format "MAJOR.MINOR.PATCH".
    """

    if use_kwargs is not None and not isinstance(use_kwargs, list):
        raise ValueError(f"use_kwargs is supposed to be a list, not {type(use_kwargs)}")

    if ignore_kwargs is not None and not isinstance(ignore_kwargs, list):
        raise ValueError(f"ignore_kwargs is supposed to be a list, not {type(ignore_kwargs)}")

    if inplace and fingerprint_names:
        raise ValueError("fingerprint_names are only used when inplace is False")

    fingerprint_names = fingerprint_names if fingerprint_names is not None else ["new_fingerprint"]

    def _fingerprint(func):
        if not inplace and not all(name in func.__code__.co_varnames for name in fingerprint_names):
            raise ValueError(f"function {func} is missing parameters {fingerprint_names} in signature")

        if randomized_function:  # randomized functions have seed and generator parameters
            if "seed" not in func.__code__.co_varnames:
                raise ValueError(f"'seed' must be in {func}'s signature")
            if "generator" not in func.__code__.co_varnames:
                raise ValueError(f"'generator' must be in {func}'s signature")
        # this call has to be outside the wrapper, since __qualname__ changes in multiprocessing
        transform = format_transform_for_fingerprint(func, version=version)

        @wraps(func)
        def wrapper(*args, **kwargs):
            kwargs_for_fingerprint = format_kwargs_for_fingerprint(
                func,
                args,
                kwargs,
                use_kwargs=use_kwargs,
                ignore_kwargs=ignore_kwargs,
                randomized_function=randomized_function,
            )

            if args:
                dataset: Dataset = args[0]
                args = args[1:]
            else:
                dataset: Dataset = kwargs.pop(next(iter(inspect.signature(func).parameters)))

            # compute new_fingerprint and add it to the args of not in-place transforms
            if inplace:
                new_fingerprint = update_fingerprint(dataset._fingerprint, transform, kwargs_for_fingerprint)
            else:
                for fingerprint_name in fingerprint_names:  # transforms like `train_test_split` have several hashes
                    if kwargs.get(fingerprint_name) is None:
                        kwargs_for_fingerprint["fingerprint_name"] = fingerprint_name
                        kwargs[fingerprint_name] = update_fingerprint(
                            dataset._fingerprint, transform, kwargs_for_fingerprint
                        )
                    else:
                        validate_fingerprint(kwargs[fingerprint_name])

            # Call actual function

            out = func(dataset, *args, **kwargs)

            # Update fingerprint of in-place transforms + update in-place history of transforms

            if inplace:  # update after calling func so that the fingerprint doesn't change if the function fails
                dataset._fingerprint = new_fingerprint

            return out

        wrapper._decorator_name_ = "fingerprint"
        return wrapper

    return _fingerprint
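# Usage sketch (illustrative): decorating a hypothetical transform so that the
# returned dataset gets a deterministic fingerprint derived from its arguments.
#
# @fingerprint_transform(inplace=False)
# def add_prefix(dataset, prefix: str, new_fingerprint: Optional[str] = None):
#     ...  # must set the returned dataset's _fingerprint to new_fingerprint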
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/datasets/hub.py
ADDED
@@ -0,0 +1,230 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import time
from itertools import chain
from typing import Optional, Union

from huggingface_hub import (
    CommitInfo,
    CommitOperationAdd,
    CommitOperationDelete,
    DatasetCard,
    DatasetCardData,
    HfApi,
    HfFileSystem,
)
from huggingface_hub.utils import HfHubHTTPError

import datasets.config
from datasets.info import DatasetInfosDict
from datasets.inspect import get_dataset_config_names, get_dataset_default_config_name
from datasets.load import load_dataset, load_dataset_builder
from datasets.utils.metadata import MetadataConfigs


def convert_to_parquet(
    repo_id: str,
    revision: Optional[str] = None,
    token: Optional[Union[bool, str]] = None,
    trust_remote_code: Optional[bool] = None,
) -> CommitInfo:
    """Convert Hub [script-based dataset](dataset_script) to Parquet [data-only dataset](repository_structure), so that
    the dataset viewer will be supported.

    This function:
    - makes a copy of the script on the "main" branch into a dedicated branch called "script" (if it does not already exist)
    - creates a pull request to the Hub dataset to convert it to Parquet files (and deletes the script from the main branch)

    If in the future you need to recreate the Parquet files from the "script" branch, pass the `revision="script"` argument.

    Note that you should pass the `trust_remote_code=True` argument only if you trust the remote code to be executed locally on your machine.

    Args:
        repo_id (`str`): ID of the source Hub dataset repository, in the following format: `<user>/<dataset_name>` or
            `<org>/<dataset_name>`.
        revision (`str`, *optional*): Branch of the source Hub dataset repository. Defaults to the `"main"` branch.
        token (`bool` or `str`, *optional*): Authentication token for the Hugging Face Hub.
        trust_remote_code (`bool`, defaults to `True`): Whether you trust the remote code of the Hub script-based
            dataset to be executed locally on your machine. This option should only be set to `True` for repositories
            where you have read the code and which you trust.

            <Tip warning={true}>

            `trust_remote_code` will default to False in the next major release.

            </Tip>

    Returns:
        `huggingface_hub.CommitInfo`
    """
    print(f"{repo_id}")
    configs = get_dataset_config_names(repo_id, token=token, revision=revision, trust_remote_code=trust_remote_code)
    print(f"{configs = }")
    default_config = get_dataset_default_config_name(
        repo_id, token=token, revision=revision, trust_remote_code=trust_remote_code
    )
    print(f"{default_config = }")
    if default_config:
        config = default_config
        configs.remove(default_config)
    else:
        config = configs.pop(0)
    print(f"{config = }")
    dataset = load_dataset(repo_id, config, revision=revision, trust_remote_code=trust_remote_code)
    commit_info = dataset.push_to_hub(
        repo_id,
        config_name=config,
        commit_message="Convert dataset to Parquet",
        commit_description="Convert dataset to Parquet.",
        create_pr=True,
        token=token,
        set_default=default_config is not None,
    )
    time.sleep(5)
    pr_revision, pr_url = commit_info.pr_revision, commit_info.pr_url
    for config in configs:
        print(f"{config = }")
        dataset = load_dataset(repo_id, config, revision=revision, trust_remote_code=trust_remote_code)
        dataset.push_to_hub(
            repo_id,
            config_name=config,
            commit_message=f"Add '{config}' config data files",
            revision=pr_revision,
            token=token,
        )
        time.sleep(5)
    _delete_files(repo_id, revision=pr_revision, token=token)
    if not revision:
        api = HfApi(endpoint=datasets.config.HF_ENDPOINT, token=token)
        try:
            api.create_branch(repo_id, branch="script", repo_type="dataset", token=token, exist_ok=True)
        except HfHubHTTPError:
            pass
    print(f"You can find your PR to convert the dataset to Parquet at: {pr_url}")
    return commit_info
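A hedged usage sketch of `convert_to_parquet` (the repo id is a placeholder; running this requires a Hugging Face token with write access to the repository, and `trust_remote_code=True` only if you have read and trust its loading script):

# Usage sketch with placeholder names.
from datasets.hub import convert_to_parquet

commit_info = convert_to_parquet(
    "username/my_scripted_dataset",  # placeholder repo id
    token=True,  # True = use the token saved by `huggingface-cli login`
    trust_remote_code=True,
)
print(commit_info.pr_url)  # the "Convert dataset to Parquet" pull request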
def delete_from_hub(
    repo_id: str,
    config_name: str,
    revision: Optional[str] = None,
    token: Optional[Union[bool, str]] = None,
) -> CommitInfo:
    """Delete a dataset configuration from a [data-only dataset](repository_structure) on the Hub.

    Args:
        repo_id (`str`): ID of the Hub dataset repository, in the following format: `<user>/<dataset_name>` or
            `<org>/<dataset_name>`.
        config_name (`str`): Name of the dataset configuration.
        revision (`str`, *optional*): Branch to delete the configuration from. Defaults to the `"main"` branch.
        token (`bool` or `str`, *optional*): Authentication token for the Hugging Face Hub.

    Returns:
        `huggingface_hub.CommitInfo`
    """
    operations = []
    # data_files
    fs = HfFileSystem(endpoint=datasets.config.HF_ENDPOINT, token=token)
    builder = load_dataset_builder(repo_id, config_name, revision=revision, token=token, trust_remote_code=False)
    for data_file in chain(*builder.config.data_files.values()):
        data_file_resolved_path = fs.resolve_path(data_file)
        if data_file_resolved_path.repo_id == repo_id:
            operations.append(CommitOperationDelete(path_in_repo=data_file_resolved_path.path_in_repo))
    # README.md
    dataset_card = DatasetCard.load(repo_id)
    # config_names
    if dataset_card.data.get("config_names", None) and config_name in dataset_card.data["config_names"]:
        dataset_card.data["config_names"].remove(config_name)
    # metadata_configs
    metadata_configs = MetadataConfigs.from_dataset_card_data(dataset_card.data)
    if metadata_configs:
        _ = metadata_configs.pop(config_name, None)
        dataset_card_data = DatasetCardData()
        metadata_configs.to_dataset_card_data(dataset_card_data)
        if datasets.config.METADATA_CONFIGS_FIELD in dataset_card_data:
            dataset_card.data[datasets.config.METADATA_CONFIGS_FIELD] = dataset_card_data[
                datasets.config.METADATA_CONFIGS_FIELD
            ]
        else:
            _ = dataset_card.data.pop(datasets.config.METADATA_CONFIGS_FIELD, None)
    # dataset_info
    dataset_infos: DatasetInfosDict = DatasetInfosDict.from_dataset_card_data(dataset_card.data)
    if dataset_infos:
        _ = dataset_infos.pop(config_name, None)
        dataset_card_data = DatasetCardData()
        dataset_infos.to_dataset_card_data(dataset_card_data)
        if "dataset_info" in dataset_card_data:
            dataset_card.data["dataset_info"] = dataset_card_data["dataset_info"]
        else:
            _ = dataset_card.data.pop("dataset_info", None)
    # Commit
    operations.append(
        CommitOperationAdd(path_in_repo=datasets.config.REPOCARD_FILENAME, path_or_fileobj=str(dataset_card).encode())
    )
    api = HfApi(endpoint=datasets.config.HF_ENDPOINT, token=token)
    commit_info = api.create_commit(
        repo_id,
        operations=operations,
        commit_message=f"Delete '{config_name}' config",
        commit_description=f"Delete '{config_name}' config.",
        token=token,
        repo_type="dataset",
        revision=revision,
        create_pr=True,
    )
    print(f"You can find your PR to delete the dataset config at: {commit_info.pr_url}")
    return commit_info


def _delete_files(dataset_id, revision=None, token=None):
    dataset_name = dataset_id.split("/")[-1]
    hf_api = HfApi(endpoint=datasets.config.HF_ENDPOINT, token=token)
    repo_files = hf_api.list_repo_files(
        dataset_id,
        repo_type="dataset",
    )
    if repo_files:
        legacy_json_file = []
        python_files = []
        data_files = []
        for filename in repo_files:
            if filename in {".gitattributes", "README.md"}:
                continue
            elif filename == f"{dataset_name}.py":
                hf_api.delete_file(
                    filename,
                    dataset_id,
                    repo_type="dataset",
                    revision=revision,
                    commit_message="Delete loading script",
                )
            elif filename == "dataset_infos.json":
                legacy_json_file.append(filename)
            elif filename.endswith(".py"):
                python_files.append(filename)
            else:
                data_files.append(filename)
        if legacy_json_file:
            hf_api.delete_file(
                "dataset_infos.json",
                dataset_id,
                repo_type="dataset",
                revision=revision,
                commit_message="Delete legacy dataset_infos.json",
            )
        if python_files:
            for filename in python_files:
                hf_api.delete_file(
                    filename,
                    dataset_id,
                    repo_type="dataset",
                    revision=revision,
                    commit_message="Delete loading script auxiliary file",
                )
        if data_files:
            for filename in data_files:
                hf_api.delete_file(
                    filename,
                    dataset_id,
                    repo_type="dataset",
                    revision=revision,
                    commit_message="Delete data file",
                )
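And a matching sketch for `delete_from_hub` (placeholder names again; it opens a pull request rather than committing to `main`):

# Usage sketch with placeholder names.
from datasets.hub import delete_from_hub

commit_info = delete_from_hub(
    "username/my_parquet_dataset",
    config_name="obsolete_config",
    token=True,
)
print(commit_info.pr_url)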
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/datasets/info.py
ADDED
@@ -0,0 +1,593 @@
# Copyright 2020 The HuggingFace Datasets Authors and the TensorFlow Datasets Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Lint as: python3
"""DatasetInfo and MetricInfo record information we know about a dataset and a metric.

This includes things that we know about the dataset statically, i.e.:
 - description
 - canonical location
 - does it have validation and tests splits
 - size
 - etc.

This also includes the things that can and should be computed once we've
processed the dataset as well:
 - number of examples (in each split)
 - etc.
"""

import copy
import dataclasses
import json
import os
import posixpath
import warnings
from dataclasses import dataclass
from pathlib import Path
from typing import ClassVar, Dict, List, Optional, Union

import fsspec
from fsspec.core import url_to_fs
from huggingface_hub import DatasetCard, DatasetCardData

from . import config
from .features import Features, Value
from .splits import SplitDict
from .tasks import TaskTemplate, task_template_from_dict
from .utils import Version
from .utils.logging import get_logger
from .utils.py_utils import asdict, unique_values


logger = get_logger(__name__)


@dataclass
class SupervisedKeysData:
    input: str = ""
    output: str = ""


@dataclass
class DownloadChecksumsEntryData:
    key: str = ""
    value: str = ""


class MissingCachedSizesConfigError(Exception):
    """The expected cached sizes of the download file are missing."""


class NonMatchingCachedSizesError(Exception):
    """The prepared split doesn't have expected sizes."""


@dataclass
class PostProcessedInfo:
    features: Optional[Features] = None
    resources_checksums: Optional[dict] = None

    def __post_init__(self):
        # Convert back to the correct classes when we reload from dict
        if self.features is not None and not isinstance(self.features, Features):
            self.features = Features.from_dict(self.features)

    @classmethod
    def from_dict(cls, post_processed_info_dict: dict) -> "PostProcessedInfo":
        field_names = {f.name for f in dataclasses.fields(cls)}
        return cls(**{k: v for k, v in post_processed_info_dict.items() if k in field_names})


@dataclass
class DatasetInfo:
    """Information about a dataset.

    `DatasetInfo` documents a dataset, including its name, version, and features.
    See the constructor arguments and properties for a full list.

    Not all fields are known on construction and may be updated later.

    Attributes:
        description (`str`):
            A description of the dataset.
        citation (`str`):
            A BibTeX citation of the dataset.
        homepage (`str`):
            A URL to the official homepage for the dataset.
        license (`str`):
            The dataset's license. It can be the name of the license or a paragraph containing the terms of the license.
        features ([`Features`], *optional*):
            The features used to specify the dataset's column types.
        post_processed (`PostProcessedInfo`, *optional*):
            Information regarding the resources of a possible post-processing of a dataset. For example, it can contain the information of an index.
        supervised_keys (`SupervisedKeysData`, *optional*):
            Specifies the input feature and the label for supervised learning if applicable for the dataset (legacy from TFDS).
        builder_name (`str`, *optional*):
            The name of the `GeneratorBasedBuilder` subclass used to create the dataset. Usually matched to the corresponding script name. It is also the snake_case version of the dataset builder class name.
        config_name (`str`, *optional*):
            The name of the configuration derived from [`BuilderConfig`].
        version (`str` or [`Version`], *optional*):
            The version of the dataset.
        splits (`dict`, *optional*):
            The mapping between split name and metadata.
        download_checksums (`dict`, *optional*):
            The mapping between the URL to download the dataset's checksums and corresponding metadata.
        download_size (`int`, *optional*):
            The size of the files to download to generate the dataset, in bytes.
        post_processing_size (`int`, *optional*):
            Size of the dataset in bytes after post-processing, if any.
        dataset_size (`int`, *optional*):
            The combined size in bytes of the Arrow tables for all splits.
        size_in_bytes (`int`, *optional*):
            The combined size in bytes of all files associated with the dataset (downloaded files + Arrow files).
        task_templates (`List[TaskTemplate]`, *optional*):
            The task templates to prepare the dataset for during training and evaluation. Each template casts the dataset's [`Features`] to standardized column names and types as detailed in `datasets.tasks`.
        **config_kwargs (additional keyword arguments):
            Keyword arguments to be passed to the [`BuilderConfig`] and used in the [`DatasetBuilder`].
    """

    # Set in the dataset scripts
    description: str = dataclasses.field(default_factory=str)
    citation: str = dataclasses.field(default_factory=str)
    homepage: str = dataclasses.field(default_factory=str)
    license: str = dataclasses.field(default_factory=str)
    features: Optional[Features] = None
    post_processed: Optional[PostProcessedInfo] = None
    supervised_keys: Optional[SupervisedKeysData] = None
    task_templates: Optional[List[TaskTemplate]] = None

    # Set later by the builder
    builder_name: Optional[str] = None
    dataset_name: Optional[str] = None  # for packaged builders, to be different from builder_name
    config_name: Optional[str] = None
    version: Optional[Union[str, Version]] = None
    # Set later by `download_and_prepare`
    splits: Optional[dict] = None
    download_checksums: Optional[dict] = None
    download_size: Optional[int] = None
    post_processing_size: Optional[int] = None
    dataset_size: Optional[int] = None
    size_in_bytes: Optional[int] = None

    _INCLUDED_INFO_IN_YAML: ClassVar[List[str]] = [
        "config_name",
        "download_size",
        "dataset_size",
        "features",
        "splits",
    ]

    def __post_init__(self):
        # Convert back to the correct classes when we reload from dict
        if self.features is not None and not isinstance(self.features, Features):
            self.features = Features.from_dict(self.features)
        if self.post_processed is not None and not isinstance(self.post_processed, PostProcessedInfo):
            self.post_processed = PostProcessedInfo.from_dict(self.post_processed)
        if self.version is not None and not isinstance(self.version, Version):
            if isinstance(self.version, str):
                self.version = Version(self.version)
            else:
                self.version = Version.from_dict(self.version)
        if self.splits is not None and not isinstance(self.splits, SplitDict):
            self.splits = SplitDict.from_split_dict(self.splits)
        if self.supervised_keys is not None and not isinstance(self.supervised_keys, SupervisedKeysData):
            if isinstance(self.supervised_keys, (tuple, list)):
                self.supervised_keys = SupervisedKeysData(*self.supervised_keys)
            else:
                self.supervised_keys = SupervisedKeysData(**self.supervised_keys)

        # Parse and make a list of templates
        if self.task_templates is not None:
            if isinstance(self.task_templates, (list, tuple)):
                templates = [
                    template if isinstance(template, TaskTemplate) else task_template_from_dict(template)
                    for template in self.task_templates
                ]
                self.task_templates = [template for template in templates if template is not None]
            elif isinstance(self.task_templates, TaskTemplate):
                self.task_templates = [self.task_templates]
            else:
                template = task_template_from_dict(self.task_templates)
                self.task_templates = [template] if template is not None else []

        # Align task templates with features
        if self.task_templates is not None:
            self.task_templates = list(self.task_templates)
            if self.features is not None:
                self.task_templates = [
                    template.align_with_features(self.features) for template in (self.task_templates)
                ]

    def write_to_directory(
        self, dataset_info_dir, pretty_print=False, fs="deprecated", storage_options: Optional[dict] = None
    ):
        """Write `DatasetInfo` and license (if present) as JSON files to `dataset_info_dir`.

        Args:
            dataset_info_dir (`str`):
                Destination directory.
            pretty_print (`bool`, defaults to `False`):
                If `True`, the JSON will be pretty-printed with the indent level of 4.
            fs (`fsspec.spec.AbstractFileSystem`, *optional*):
                Instance of the remote filesystem used to download the files from.

                <Deprecated version="2.9.0">

                `fs` was deprecated in version 2.9.0 and will be removed in 3.0.0.
                Please use `storage_options` instead, e.g. `storage_options=fs.storage_options`.

                </Deprecated>

            storage_options (`dict`, *optional*):
                Key/value pairs to be passed on to the file-system backend, if any.

                <Added version="2.9.0"/>

        Example:

        ```py
        >>> from datasets import load_dataset
        >>> ds = load_dataset("rotten_tomatoes", split="validation")
        >>> ds.info.write_to_directory("/path/to/directory/")
        ```
        """
        if fs != "deprecated":
            warnings.warn(
                "'fs' was deprecated in favor of 'storage_options' in version 2.9.0 and will be removed in 3.0.0.\n"
                "You can remove this warning by passing 'storage_options=fs.storage_options' instead.",
                FutureWarning,
            )
            storage_options = fs.storage_options

        fs: fsspec.AbstractFileSystem
        fs, *_ = url_to_fs(dataset_info_dir, **(storage_options or {}))
        with fs.open(posixpath.join(dataset_info_dir, config.DATASET_INFO_FILENAME), "wb") as f:
            self._dump_info(f, pretty_print=pretty_print)
        if self.license:
            with fs.open(posixpath.join(dataset_info_dir, config.LICENSE_FILENAME), "wb") as f:
                self._dump_license(f)

    def _dump_info(self, file, pretty_print=False):
        """Dump info in `file` file-like object open in bytes mode (to support remote files)"""
        file.write(json.dumps(asdict(self), indent=4 if pretty_print else None).encode("utf-8"))

    def _dump_license(self, file):
        """Dump license in `file` file-like object open in bytes mode (to support remote files)"""
        file.write(self.license.encode("utf-8"))

    @classmethod
    def from_merge(cls, dataset_infos: List["DatasetInfo"]):
        dataset_infos = [dset_info.copy() for dset_info in dataset_infos if dset_info is not None]

        if len(dataset_infos) > 0 and all(dataset_infos[0] == dset_info for dset_info in dataset_infos):
            # if all dataset_infos are equal we don't need to merge. Just return the first.
            return dataset_infos[0]

        description = "\n\n".join(unique_values(info.description for info in dataset_infos)).strip()
        citation = "\n\n".join(unique_values(info.citation for info in dataset_infos)).strip()
        homepage = "\n\n".join(unique_values(info.homepage for info in dataset_infos)).strip()
        license = "\n\n".join(unique_values(info.license for info in dataset_infos)).strip()
        features = None
        supervised_keys = None
        task_templates = None

        # Find common task templates across all dataset infos
        all_task_templates = [info.task_templates for info in dataset_infos if info.task_templates is not None]
        if len(all_task_templates) > 1:
            task_templates = list(set(all_task_templates[0]).intersection(*all_task_templates[1:]))
        elif len(all_task_templates):
            task_templates = list(set(all_task_templates[0]))
        # If no common task templates found, replace empty list with None
        task_templates = task_templates if task_templates else None

        return cls(
            description=description,
            citation=citation,
            homepage=homepage,
            license=license,
            features=features,
            supervised_keys=supervised_keys,
            task_templates=task_templates,
        )

    @classmethod
    def from_directory(
        cls, dataset_info_dir: str, fs="deprecated", storage_options: Optional[dict] = None
    ) -> "DatasetInfo":
        """Create [`DatasetInfo`] from the JSON file in `dataset_info_dir`.

        This function updates all the dynamically generated fields (num_examples,
        hash, time of creation,...) of the [`DatasetInfo`].

        This will overwrite all previous metadata.

        Args:
            dataset_info_dir (`str`):
                The directory containing the metadata file. This
                should be the root directory of a specific dataset version.
            fs (`fsspec.spec.AbstractFileSystem`, *optional*):
                Instance of the remote filesystem used to download the files from.

                <Deprecated version="2.9.0">

                `fs` was deprecated in version 2.9.0 and will be removed in 3.0.0.
                Please use `storage_options` instead, e.g. `storage_options=fs.storage_options`.

                </Deprecated>

            storage_options (`dict`, *optional*):
                Key/value pairs to be passed on to the file-system backend, if any.

                <Added version="2.9.0"/>

        Example:

        ```py
        >>> from datasets import DatasetInfo
        >>> ds_info = DatasetInfo.from_directory("/path/to/directory/")
        ```
        """
        if fs != "deprecated":
            warnings.warn(
                "'fs' was deprecated in favor of 'storage_options' in version 2.9.0 and will be removed in 3.0.0.\n"
                "You can remove this warning by passing 'storage_options=fs.storage_options' instead.",
                FutureWarning,
            )
            storage_options = fs.storage_options

        fs: fsspec.AbstractFileSystem
        fs, *_ = url_to_fs(dataset_info_dir, **(storage_options or {}))
        logger.info(f"Loading Dataset info from {dataset_info_dir}")
        if not dataset_info_dir:
            raise ValueError("Calling DatasetInfo.from_directory() with undefined dataset_info_dir.")
        with fs.open(posixpath.join(dataset_info_dir, config.DATASET_INFO_FILENAME), "r", encoding="utf-8") as f:
            dataset_info_dict = json.load(f)
        return cls.from_dict(dataset_info_dict)

    @classmethod
    def from_dict(cls, dataset_info_dict: dict) -> "DatasetInfo":
        field_names = {f.name for f in dataclasses.fields(cls)}
        return cls(**{k: v for k, v in dataset_info_dict.items() if k in field_names})

    def update(self, other_dataset_info: "DatasetInfo", ignore_none=True):
        self_dict = self.__dict__
        self_dict.update(
            **{
                k: copy.deepcopy(v)
                for k, v in other_dataset_info.__dict__.items()
                if (v is not None or not ignore_none)
            }
        )

    def copy(self) -> "DatasetInfo":
        return self.__class__(**{k: copy.deepcopy(v) for k, v in self.__dict__.items()})

    def _to_yaml_dict(self) -> dict:
        yaml_dict = {}
        dataset_info_dict = asdict(self)
        for key in dataset_info_dict:
            if key in self._INCLUDED_INFO_IN_YAML:
                value = getattr(self, key)
                if hasattr(value, "_to_yaml_list"):  # Features, SplitDict
                    yaml_dict[key] = value._to_yaml_list()
                elif hasattr(value, "_to_yaml_string"):  # Version
                    yaml_dict[key] = value._to_yaml_string()
                else:
                    yaml_dict[key] = value
        return yaml_dict

    @classmethod
    def _from_yaml_dict(cls, yaml_data: dict) -> "DatasetInfo":
        yaml_data = copy.deepcopy(yaml_data)
        if yaml_data.get("features") is not None:
            yaml_data["features"] = Features._from_yaml_list(yaml_data["features"])
        if yaml_data.get("splits") is not None:
            yaml_data["splits"] = SplitDict._from_yaml_list(yaml_data["splits"])
        field_names = {f.name for f in dataclasses.fields(cls)}
        return cls(**{k: v for k, v in yaml_data.items() if k in field_names})
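A round-trip sketch combining `write_to_directory` and `from_directory` (the path is a placeholder; this mirrors the docstring examples above):

# Round-trip sketch: dump dataset_info.json, then read it back.
from datasets import DatasetInfo, load_dataset

ds = load_dataset("rotten_tomatoes", split="validation")
ds.info.write_to_directory("/tmp/rt_info", pretty_print=True)
info = DatasetInfo.from_directory("/tmp/rt_info")
assert info.features == ds.info.features  # features survive the JSON round-trip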
class DatasetInfosDict(Dict[str, DatasetInfo]):
    def write_to_directory(self, dataset_infos_dir, overwrite=False, pretty_print=False) -> None:
        total_dataset_infos = {}
        dataset_infos_path = os.path.join(dataset_infos_dir, config.DATASETDICT_INFOS_FILENAME)
        dataset_readme_path = os.path.join(dataset_infos_dir, config.REPOCARD_FILENAME)
        if not overwrite:
            total_dataset_infos = self.from_directory(dataset_infos_dir)
        total_dataset_infos.update(self)
        if os.path.exists(dataset_infos_path):
            # for backward compatibility, let's update the JSON file if it exists
            with open(dataset_infos_path, "w", encoding="utf-8") as f:
                dataset_infos_dict = {
                    config_name: asdict(dset_info) for config_name, dset_info in total_dataset_infos.items()
                }
                json.dump(dataset_infos_dict, f, indent=4 if pretty_print else None)
        # Dump the infos in the YAML part of the README.md file
        if os.path.exists(dataset_readme_path):
            dataset_card = DatasetCard.load(dataset_readme_path)
            dataset_card_data = dataset_card.data
        else:
            dataset_card = None
            dataset_card_data = DatasetCardData()
        if total_dataset_infos:
            total_dataset_infos.to_dataset_card_data(dataset_card_data)
            dataset_card = (
                DatasetCard("---\n" + str(dataset_card_data) + "\n---\n") if dataset_card is None else dataset_card
            )
            dataset_card.save(Path(dataset_readme_path))

    @classmethod
    def from_directory(cls, dataset_infos_dir) -> "DatasetInfosDict":
        logger.info(f"Loading Dataset Infos from {dataset_infos_dir}")
        # Load the info from the YAML part of README.md
        if os.path.exists(os.path.join(dataset_infos_dir, config.REPOCARD_FILENAME)):
            dataset_card_data = DatasetCard.load(Path(dataset_infos_dir) / config.REPOCARD_FILENAME).data
            if "dataset_info" in dataset_card_data:
                return cls.from_dataset_card_data(dataset_card_data)
        if os.path.exists(os.path.join(dataset_infos_dir, config.DATASETDICT_INFOS_FILENAME)):
            # this is just to have backward compatibility with dataset_infos.json files
            with open(os.path.join(dataset_infos_dir, config.DATASETDICT_INFOS_FILENAME), encoding="utf-8") as f:
                return cls(
                    {
                        config_name: DatasetInfo.from_dict(dataset_info_dict)
                        for config_name, dataset_info_dict in json.load(f).items()
                    }
                )
        else:
            return cls()

    @classmethod
    def from_dataset_card_data(cls, dataset_card_data: DatasetCardData) -> "DatasetInfosDict":
        if isinstance(dataset_card_data.get("dataset_info"), (list, dict)):
            if isinstance(dataset_card_data["dataset_info"], list):
                return cls(
                    {
                        dataset_info_yaml_dict.get("config_name", "default"): DatasetInfo._from_yaml_dict(
                            dataset_info_yaml_dict
                        )
                        for dataset_info_yaml_dict in dataset_card_data["dataset_info"]
                    }
                )
            else:
                dataset_info = DatasetInfo._from_yaml_dict(dataset_card_data["dataset_info"])
                dataset_info.config_name = dataset_card_data["dataset_info"].get("config_name", "default")
                return cls({dataset_info.config_name: dataset_info})
        else:
            return cls()

    def to_dataset_card_data(self, dataset_card_data: DatasetCardData) -> None:
        if self:
            # first get existing metadata info
            if "dataset_info" in dataset_card_data and isinstance(dataset_card_data["dataset_info"], dict):
                dataset_metadata_infos = {
                    dataset_card_data["dataset_info"].get("config_name", "default"): dataset_card_data["dataset_info"]
                }
            elif "dataset_info" in dataset_card_data and isinstance(dataset_card_data["dataset_info"], list):
                dataset_metadata_infos = {
                    config_metadata["config_name"]: config_metadata
                    for config_metadata in dataset_card_data["dataset_info"]
                }
            else:
                dataset_metadata_infos = {}
            # update/rewrite existing metadata info with the one to dump
            total_dataset_infos = {
                **dataset_metadata_infos,
                **{config_name: dset_info._to_yaml_dict() for config_name, dset_info in self.items()},
            }
            # the config_name from the dataset_infos_dict takes over the config_name of the DatasetInfo
            for config_name, dset_info_yaml_dict in total_dataset_infos.items():
                dset_info_yaml_dict["config_name"] = config_name
            if len(total_dataset_infos) == 1:
                # use a struct instead of a list of configurations, since there's only one
                dataset_card_data["dataset_info"] = next(iter(total_dataset_infos.values()))
                config_name = dataset_card_data["dataset_info"].pop("config_name", None)
                if config_name != "default":
                    # if config_name is not "default" preserve it and put at the first position
                    dataset_card_data["dataset_info"] = {
                        "config_name": config_name,
                        **dataset_card_data["dataset_info"],
                    }
            else:
                dataset_card_data["dataset_info"] = []
                for config_name, dataset_info_yaml_dict in sorted(total_dataset_infos.items()):
                    # add the config_name field in first position
                    dataset_info_yaml_dict.pop("config_name", None)
                    dataset_info_yaml_dict = {"config_name": config_name, **dataset_info_yaml_dict}
                    dataset_card_data["dataset_info"].append(dataset_info_yaml_dict)


@dataclass
class MetricInfo:
    """Information about a metric.

    `MetricInfo` documents a metric, including its name, version, and features.
    See the constructor arguments and properties for a full list.

    Note: Not all fields are known on construction and may be updated later.
    """

    # Set in the dataset scripts
    description: str
    citation: str
    features: Features
    inputs_description: str = dataclasses.field(default_factory=str)
    homepage: str = dataclasses.field(default_factory=str)
    license: str = dataclasses.field(default_factory=str)
    codebase_urls: List[str] = dataclasses.field(default_factory=list)
    reference_urls: List[str] = dataclasses.field(default_factory=list)
    streamable: bool = False
    format: Optional[str] = None

    # Set later by the builder
    metric_name: Optional[str] = None
    config_name: Optional[str] = None
    experiment_id: Optional[str] = None

    def __post_init__(self):
        if self.format is not None:
            for key, value in self.features.items():
                if not isinstance(value, Value):
                    raise ValueError(
                        f"When using 'numpy' format, all features should be a `datasets.Value` feature. "
                        f"Here {key} is an instance of {value.__class__.__name__}"
                    )

    def write_to_directory(self, metric_info_dir, pretty_print=False):
        """Write `MetricInfo` as JSON to `metric_info_dir`.
        Also save the license separately in LICENCE.
        If `pretty_print` is True, the JSON will be pretty-printed with the indent level of 4.

        Example:

        ```py
        >>> from datasets import load_metric
        >>> metric = load_metric("accuracy")
        >>> metric.info.write_to_directory("/path/to/directory/")
        ```
        """
        with open(os.path.join(metric_info_dir, config.METRIC_INFO_FILENAME), "w", encoding="utf-8") as f:
            json.dump(asdict(self), f, indent=4 if pretty_print else None)

        if self.license:
            with open(os.path.join(metric_info_dir, config.LICENSE_FILENAME), "w", encoding="utf-8") as f:
                f.write(self.license)

    @classmethod
    def from_directory(cls, metric_info_dir) -> "MetricInfo":
        """Create MetricInfo from the JSON file in `metric_info_dir`.

        Args:
            metric_info_dir: `str` The directory containing the metadata file. This
                should be the root directory of a specific dataset version.

        Example:

        ```py
        >>> from datasets import MetricInfo
        >>> metric_info = MetricInfo.from_directory("/path/to/directory/")
        ```
        """
        logger.info(f"Loading Metric info from {metric_info_dir}")
        if not metric_info_dir:
            raise ValueError("Calling MetricInfo.from_directory() with undefined metric_info_dir.")

        with open(os.path.join(metric_info_dir, config.METRIC_INFO_FILENAME), encoding="utf-8") as f:
            metric_info_dict = json.load(f)
        return cls.from_dict(metric_info_dict)

    @classmethod
    def from_dict(cls, metric_info_dict: dict) -> "MetricInfo":
        field_names = {f.name for f in dataclasses.fields(cls)}
        return cls(**{k: v for k, v in metric_info_dict.items() if k in field_names})
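A small sketch of the card-data round trip implemented by the two classmethods above (illustrative values; relies on `huggingface_hub`'s `DatasetCardData` supporting item access, which the code above already assumes):

# Round-trip sketch: DatasetInfosDict -> dataset card YAML data -> DatasetInfosDict.
from huggingface_hub import DatasetCardData
from datasets.info import DatasetInfo, DatasetInfosDict

infos = DatasetInfosDict({"default": DatasetInfo(description="toy example")})
card_data = DatasetCardData()
infos.to_dataset_card_data(card_data)  # fills the "dataset_info" field
restored = DatasetInfosDict.from_dataset_card_data(card_data)
assert restored["default"].config_name == "default"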
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/datasets/iterable_dataset.py
ADDED
The diff for this file is too large to render. See raw diff.
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/datasets/metric.py
ADDED
@@ -0,0 +1,652 @@
# Copyright 2020 The HuggingFace Datasets Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Lint as: python3
"""Metrics base class."""

import os
import types
import uuid
from typing import Any, Dict, List, Optional, Tuple, Union

import numpy as np
import pyarrow as pa
from filelock import BaseFileLock, Timeout

from . import config
from .arrow_dataset import Dataset
from .arrow_reader import ArrowReader
from .arrow_writer import ArrowWriter
from .download.download_config import DownloadConfig
from .download.download_manager import DownloadManager
from .features import Features
from .info import DatasetInfo, MetricInfo
from .naming import camelcase_to_snakecase
from .utils._filelock import FileLock
from .utils.deprecation_utils import deprecated
from .utils.logging import get_logger
from .utils.py_utils import copyfunc, temp_seed


logger = get_logger(__name__)


class FileFreeLock(BaseFileLock):
    """Thread lock until a file **cannot** be locked"""

    def __init__(self, lock_file, *args, **kwargs):
        self.filelock = FileLock(lock_file)
        super().__init__(self.filelock.lock_file, *args, **kwargs)

    def _acquire(self):
        try:
            self.filelock.acquire(timeout=0.01, poll_intervall=0.02)  # Try to lock once
        except Timeout:
            # We couldn't acquire the lock, the file is locked!
            self._context.lock_file_fd = self.filelock.lock_file
        else:
            # We were able to acquire the lock, the file is not yet locked!
            self.filelock.release()
            self._context.lock_file_fd = None

    def _release(self):
        self._context.lock_file_fd = None


# lists - summarize long lists similarly to NumPy
# arrays/tensors - let the frameworks control formatting
def summarize_if_long_list(obj):
    if not type(obj) == list or len(obj) <= 6:  # noqa: E721
        return f"{obj}"

    def format_chunk(chunk):
        return ", ".join(repr(x) for x in chunk)

    return f"[{format_chunk(obj[:3])}, ..., {format_chunk(obj[-3:])}]"

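A quick behavior check for the helper above (expected strings worked out by hand from the code):

# Long lists are elided NumPy-style; short lists are printed whole.
assert summarize_if_long_list(list(range(10))) == "[0, 1, 2, ..., 7, 8, 9]"
assert summarize_if_long_list([1, 2, 3]) == "[1, 2, 3]"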
class MetricInfoMixin:
|
| 80 |
+
"""This base class exposes some attributes of MetricInfo
|
| 81 |
+
at the base level of the Metric for easy access.
|
| 82 |
+
|
| 83 |
+
<Deprecated version="2.5.0">
|
| 84 |
+
|
| 85 |
+
Use the new library 🤗 Evaluate instead: https://huggingface.co/docs/evaluate
|
| 86 |
+
|
| 87 |
+
</Deprecated>
|
| 88 |
+
|
| 89 |
+
"""
|
| 90 |
+
|
| 91 |
+
def __init__(self, info: MetricInfo):
|
| 92 |
+
self._metric_info = info
|
| 93 |
+
|
| 94 |
+
@property
|
| 95 |
+
def info(self):
|
| 96 |
+
""":class:`datasets.MetricInfo` object containing all the metadata in the metric."""
|
| 97 |
+
return self._metric_info
|
| 98 |
+
|
| 99 |
+
@property
|
| 100 |
+
def name(self) -> str:
|
| 101 |
+
return self._metric_info.metric_name
|
| 102 |
+
|
| 103 |
+
@property
|
| 104 |
+
def experiment_id(self) -> Optional[str]:
|
| 105 |
+
return self._metric_info.experiment_id
|
| 106 |
+
|
| 107 |
+
@property
|
| 108 |
+
def description(self) -> str:
|
| 109 |
+
return self._metric_info.description
|
| 110 |
+
|
| 111 |
+
@property
|
| 112 |
+
def citation(self) -> str:
|
| 113 |
+
return self._metric_info.citation
|
| 114 |
+
|
| 115 |
+
@property
|
| 116 |
+
def features(self) -> Features:
|
| 117 |
+
return self._metric_info.features
|
| 118 |
+
|
| 119 |
+
@property
|
| 120 |
+
def inputs_description(self) -> str:
|
| 121 |
+
return self._metric_info.inputs_description
|
| 122 |
+
|
| 123 |
+
@property
|
| 124 |
+
def homepage(self) -> Optional[str]:
|
| 125 |
+
return self._metric_info.homepage
|
| 126 |
+
|
| 127 |
+
@property
|
| 128 |
+
def license(self) -> str:
|
| 129 |
+
return self._metric_info.license
|
| 130 |
+
|
| 131 |
+
@property
|
| 132 |
+
def codebase_urls(self) -> Optional[List[str]]:
|
| 133 |
+
return self._metric_info.codebase_urls
|
| 134 |
+
|
| 135 |
+
@property
|
| 136 |
+
def reference_urls(self) -> Optional[List[str]]:
|
| 137 |
+
return self._metric_info.reference_urls
|
| 138 |
+
|
| 139 |
+
@property
|
| 140 |
+
def streamable(self) -> bool:
|
| 141 |
+
return self._metric_info.streamable
|
| 142 |
+
|
| 143 |
+
@property
|
| 144 |
+
def format(self) -> Optional[str]:
|
| 145 |
+
return self._metric_info.format
|
| 146 |
+
|
| 147 |
+
|
| 148 |
+
class Metric(MetricInfoMixin):
|
| 149 |
+
"""A Metric is the base class and common API for all metrics.
|
| 150 |
+
|
| 151 |
+
<Deprecated version="2.5.0">
|
| 152 |
+
|
| 153 |
+
Use the new library 🤗 Evaluate instead: https://huggingface.co/docs/evaluate
|
| 154 |
+
|
| 155 |
+
</Deprecated>
|
| 156 |
+
|
| 157 |
+
Args:
|
| 158 |
+
config_name (``str``): This is used to define a hash specific to a metrics computation script and prevents the metric's data
|
| 159 |
+
to be overridden when the metric loading script is modified.
|
| 160 |
+
keep_in_memory (:obj:`bool`): keep all predictions and references in memory. Not possible in distributed settings.
|
| 161 |
+
cache_dir (``str``): Path to a directory in which temporary prediction/references data will be stored.
|
| 162 |
+
The data directory should be located on a shared file-system in distributed setups.
|
| 163 |
+
num_process (``int``): specify the total number of nodes in a distributed settings.
|
| 164 |
+
This is useful to compute metrics in distributed setups (in particular non-additive metrics like F1).
|
| 165 |
+
process_id (``int``): specify the id of the current process in a distributed setup (between 0 and num_process-1)
|
| 166 |
+
This is useful to compute metrics in distributed setups (in particular non-additive metrics like F1).
|
| 167 |
+
seed (:obj:`int`, optional): If specified, this will temporarily set numpy's random seed when :func:`datasets.Metric.compute` is run.
|
| 168 |
+
experiment_id (``str``): A specific experiment id. This is used if several distributed evaluations share the same file system.
|
| 169 |
+
This is useful to compute metrics in distributed setups (in particular non-additive metrics like F1).
|
| 170 |
+
max_concurrent_cache_files (``int``): Max number of concurrent metrics cache files (default 10000).
|
| 171 |
+
timeout (``Union[int, float]``): Timeout in second for distributed setting synchronization.
|
| 172 |
+
"""
|
| 173 |
+
|
| 174 |
+
    @deprecated("Use the new library 🤗 Evaluate instead: https://huggingface.co/docs/evaluate")
    def __init__(
        self,
        config_name: Optional[str] = None,
        keep_in_memory: bool = False,
        cache_dir: Optional[str] = None,
        num_process: int = 1,
        process_id: int = 0,
        seed: Optional[int] = None,
        experiment_id: Optional[str] = None,
        max_concurrent_cache_files: int = 10000,
        timeout: Union[int, float] = 100,
        **kwargs,
    ):
        # prepare info
        self.config_name = config_name or "default"
        info = self._info()
        info.metric_name = camelcase_to_snakecase(self.__class__.__name__)
        info.config_name = self.config_name
        info.experiment_id = experiment_id or "default_experiment"
        MetricInfoMixin.__init__(self, info)  # For easy access on low level

        # Safety checks on num_process and process_id
        if not isinstance(process_id, int) or process_id < 0:
            raise ValueError("'process_id' should be an integer greater than or equal to 0")
        if not isinstance(num_process, int) or num_process <= process_id:
            raise ValueError("'num_process' should be an integer greater than process_id")
        if keep_in_memory and num_process != 1:
            raise ValueError("Using 'keep_in_memory' is not possible in distributed setting (num_process > 1).")

        self.num_process = num_process
        self.process_id = process_id
        self.max_concurrent_cache_files = max_concurrent_cache_files

        self.keep_in_memory = keep_in_memory
        self._data_dir_root = os.path.expanduser(cache_dir or config.HF_METRICS_CACHE)
        self.data_dir = self._build_data_dir()
        if seed is None:
            _, seed, pos, *_ = np.random.get_state()
            self.seed: int = seed[pos] if pos < 624 else seed[0]
        else:
            self.seed: int = seed
        self.timeout: Union[int, float] = timeout

        # Update 'compute' and 'add' docstring
        # methods need to be copied otherwise it changes the docstrings of every instance
        self.compute = types.MethodType(copyfunc(self.compute), self)
        self.add_batch = types.MethodType(copyfunc(self.add_batch), self)
        self.add = types.MethodType(copyfunc(self.add), self)
        self.compute.__func__.__doc__ += self.info.inputs_description
        self.add_batch.__func__.__doc__ += self.info.inputs_description
        self.add.__func__.__doc__ += self.info.inputs_description

        # self.arrow_schema = pa.schema(field for field in self.info.features.type)
        self.buf_writer = None
        self.writer = None
        self.writer_batch_size = None
        self.data = None

        # This is the cache file we store our predictions/references in
        # Keep it None for now so we can (cloud)pickle the object
        self.cache_file_name = None
        self.filelock = None
        self.rendez_vous_lock = None

        # This is all the cache files on which we have a lock when we are in a distributed setting
        self.file_paths = None
        self.filelocks = None

    def __len__(self):
        """Return the number of examples (predictions or predictions/references pairs)
        currently stored in the metric's cache.
        """
        return 0 if self.writer is None else len(self.writer)

    def __repr__(self):
        return (
            f'Metric(name: "{self.name}", features: {self.features}, '
            f'usage: """{self.inputs_description}""", '
            f"stored examples: {len(self)})"
        )

    def _build_data_dir(self):
        """Path of this metric in cache_dir:
        Will be:
            self._data_dir_root/self.name/self.config_name/self.hash (if not none)/
        If any of these elements is missing or if ``with_version=False`` the corresponding subfolders are dropped.
        """
        builder_data_dir = self._data_dir_root
        builder_data_dir = os.path.join(builder_data_dir, self.name, self.config_name)
        os.makedirs(builder_data_dir, exist_ok=True)
        return builder_data_dir

    def _create_cache_file(self, timeout=1) -> Tuple[str, FileLock]:
        """Create a new cache file. If the default cache file is used, we generate a new hash."""
        file_path = os.path.join(self.data_dir, f"{self.experiment_id}-{self.num_process}-{self.process_id}.arrow")
        filelock = None
        for i in range(self.max_concurrent_cache_files):
            filelock = FileLock(file_path + ".lock")
            try:
                filelock.acquire(timeout=timeout)
            except Timeout:
                # If we have reached the max number of attempts or we are not allowed to find a free name (distributed setup)
                # We raise an error
                if self.num_process != 1:
                    raise ValueError(
                        f"Error in _create_cache_file: another metric instance is already using the local cache file at {file_path}. "
                        f"Please specify an experiment_id (currently: {self.experiment_id}) to avoid collision "
                        f"between distributed metric instances."
                    ) from None
                if i == self.max_concurrent_cache_files - 1:
                    raise ValueError(
                        f"Cannot acquire lock, too many metric instances are operating concurrently on this file system. "
                        f"You should set a larger value of max_concurrent_cache_files when creating the metric "
                        f"(current value is {self.max_concurrent_cache_files})."
                    ) from None
                # In other cases (allowed to find a new file name + not yet at max num of attempts) we can try to sample a new hashing name.
                file_uuid = str(uuid.uuid4())
                file_path = os.path.join(
                    self.data_dir, f"{self.experiment_id}-{file_uuid}-{self.num_process}-{self.process_id}.arrow"
                )
            else:
                break

        return file_path, filelock

    def _get_all_cache_files(self) -> Tuple[List[str], List[FileLock]]:
        """Get a lock on all the cache files in a distributed setup.
        We wait for `timeout` seconds to let all the distributed nodes finish their tasks (default is 100 seconds).
        """
        if self.num_process == 1:
            if self.cache_file_name is None:
                raise ValueError(
                    "Metric cache file doesn't exist. Please make sure that you call `add` or `add_batch` "
                    "at least once before calling `compute`."
                )
            file_paths = [self.cache_file_name]
        else:
            file_paths = [
                os.path.join(self.data_dir, f"{self.experiment_id}-{self.num_process}-{process_id}.arrow")
                for process_id in range(self.num_process)
            ]

        # Let's acquire a lock on each process files to be sure they are finished writing
        filelocks = []
        for process_id, file_path in enumerate(file_paths):
            if process_id == 0:  # process 0 already has its lock file
                filelocks.append(self.filelock)
            else:
                filelock = FileLock(file_path + ".lock")
                try:
                    filelock.acquire(timeout=self.timeout)
                except Timeout:
                    raise ValueError(
                        f"Cannot acquire lock on cached file {file_path} for process {process_id}."
                    ) from None
                else:
                    filelocks.append(filelock)

        return file_paths, filelocks

    def _check_all_processes_locks(self):
        expected_lock_file_names = [
            os.path.join(self.data_dir, f"{self.experiment_id}-{self.num_process}-{process_id}.arrow.lock")
            for process_id in range(self.num_process)
        ]
        for expected_lock_file_name in expected_lock_file_names:
            nofilelock = FileFreeLock(expected_lock_file_name)
            try:
                nofilelock.acquire(timeout=self.timeout)
            except Timeout:
                raise ValueError(
                    f"Expected to find locked file {expected_lock_file_name} from process {self.process_id} but it doesn't exist."
                ) from None
            else:
                nofilelock.release()

    def _check_rendez_vous(self):
        expected_lock_file_name = os.path.join(self.data_dir, f"{self.experiment_id}-{self.num_process}-0.arrow.lock")
        nofilelock = FileFreeLock(expected_lock_file_name)
        try:
            nofilelock.acquire(timeout=self.timeout)
        except Timeout:
            raise ValueError(
                f"Expected to find locked file {expected_lock_file_name} from process {self.process_id} but it doesn't exist."
            ) from None
        else:
            nofilelock.release()
        lock_file_name = os.path.join(self.data_dir, f"{self.experiment_id}-{self.num_process}-rdv.lock")
        rendez_vous_lock = FileLock(lock_file_name)
        try:
            rendez_vous_lock.acquire(timeout=self.timeout)
        except Timeout:
            raise ValueError(f"Couldn't acquire lock on {lock_file_name} from process {self.process_id}.") from None
        else:
            rendez_vous_lock.release()

    def _finalize(self):
        """Close all the writing processes and load/gather the data
        from all the nodes if main node or all_process is True.
        """
        if self.writer is not None:
            self.writer.finalize()
        self.writer = None
        # release the locks of the processes > 0 so that process 0 can lock them to read + delete the data
        if self.filelock is not None and self.process_id > 0:
            self.filelock.release()

        if self.keep_in_memory:
            # Read the predictions and references
            reader = ArrowReader(path=self.data_dir, info=DatasetInfo(features=self.features))
            self.data = Dataset.from_buffer(self.buf_writer.getvalue())

        elif self.process_id == 0:
            # Let's acquire a lock on each node files to be sure they are finished writing
            file_paths, filelocks = self._get_all_cache_files()

            # Read the predictions and references
            try:
                reader = ArrowReader(path="", info=DatasetInfo(features=self.features))
                self.data = Dataset(**reader.read_files([{"filename": f} for f in file_paths]))
            except FileNotFoundError:
                raise ValueError(
                    "Error in finalize: another metric instance is already using the local cache file. "
                    "Please specify an experiment_id to avoid collision between distributed metric instances."
                ) from None

            # Store file paths and locks and we will release/delete them after the computation.
            self.file_paths = file_paths
            self.filelocks = filelocks

    def compute(self, *, predictions=None, references=None, **kwargs) -> Optional[dict]:
        """Compute the metrics.

        Usage of positional arguments is not allowed to prevent mistakes.

        Args:
            predictions (list/array/tensor, optional): Predictions.
            references (list/array/tensor, optional): References.
            **kwargs (optional): Keyword arguments that will be forwarded to the metrics :meth:`_compute`
                method (see details in the docstring).

        Return:
            dict or None

            - Dictionary with the metrics if this metric is run on the main process (``process_id == 0``).
            - None if the metric is not run on the main process (``process_id != 0``).

        Example:

        ```py
        >>> from datasets import load_metric
        >>> metric = load_metric("accuracy")
        >>> accuracy = metric.compute(predictions=model_prediction, references=labels)
        ```
        """
        all_kwargs = {"predictions": predictions, "references": references, **kwargs}
        if predictions is None and references is None:
            missing_kwargs = {k: None for k in self.features if k not in all_kwargs}
            all_kwargs.update(missing_kwargs)
        else:
            missing_inputs = [k for k in self.features if k not in all_kwargs]
            if missing_inputs:
                raise ValueError(
                    f"Metric inputs are missing: {missing_inputs}. All required inputs are {list(self.features)}"
                )
        inputs = {input_name: all_kwargs[input_name] for input_name in self.features}
        compute_kwargs = {k: kwargs[k] for k in kwargs if k not in self.features}

        if any(v is not None for v in inputs.values()):
            self.add_batch(**inputs)
        self._finalize()

        self.cache_file_name = None
        self.filelock = None

        if self.process_id == 0:
            self.data.set_format(type=self.info.format)

            inputs = {input_name: self.data[input_name] for input_name in self.features}
            with temp_seed(self.seed):
                output = self._compute(**inputs, **compute_kwargs)

            if self.buf_writer is not None:
                self.buf_writer = None
                del self.data
                self.data = None
            else:
                # Release locks and delete all the cache files. Process 0 is released last.
                for filelock, file_path in reversed(list(zip(self.filelocks, self.file_paths))):
                    logger.info(f"Removing {file_path}")
                    del self.data
                    self.data = None
                    del self.writer
                    self.writer = None
                    os.remove(file_path)
                    filelock.release()

            return output
        else:
            return None

    def add_batch(self, *, predictions=None, references=None, **kwargs):
        """Add a batch of predictions and references for the metric's stack.

        Args:
            predictions (list/array/tensor, optional): Predictions.
            references (list/array/tensor, optional): References.

        Example:

        ```py
        >>> from datasets import load_metric
        >>> metric = load_metric("accuracy")
        >>> metric.add_batch(predictions=model_prediction, references=labels)
        ```
        """
        bad_inputs = [input_name for input_name in kwargs if input_name not in self.features]
        if bad_inputs:
            raise ValueError(f"Bad inputs for metric: {bad_inputs}. All required inputs are {list(self.features)}")
        batch = {"predictions": predictions, "references": references, **kwargs}
        batch = {input_name: batch[input_name] for input_name in self.features}
        batch = self.info.features.encode_batch(batch)
        if self.writer is None:
            self._init_writer()
        try:
            self.writer.write_batch(batch)
        except pa.ArrowInvalid:
            if any(len(batch[c]) != len(next(iter(batch.values()))) for c in batch):
                col0 = next(iter(batch))
                bad_col = [c for c in batch if len(batch[c]) != len(batch[col0])][0]
                error_msg = (
                    f"Mismatch in the number of {col0} ({len(batch[col0])}) and {bad_col} ({len(batch[bad_col])})"
                )
            elif sorted(self.features) != ["predictions", "references"]:
                error_msg = f"Metric inputs don't match the expected format.\n" f"Expected format: {self.features},\n"
                error_msg_inputs = ",\n".join(
                    f"Input {input_name}: {summarize_if_long_list(batch[input_name])}" for input_name in self.features
                )
                error_msg += error_msg_inputs
            else:
                error_msg = (
                    f"Predictions and/or references don't match the expected format.\n"
                    f"Expected format: {self.features},\n"
                    f"Input predictions: {summarize_if_long_list(predictions)},\n"
                    f"Input references: {summarize_if_long_list(references)}"
                )
            raise ValueError(error_msg) from None

    def add(self, *, prediction=None, reference=None, **kwargs):
        """Add one prediction and reference for the metric's stack.

        Args:
            prediction (list/array/tensor, optional): Prediction.
            reference (list/array/tensor, optional): Reference.

        Example:

        ```py
        >>> from datasets import load_metric
        >>> metric = load_metric("accuracy")
        >>> metric.add(prediction=model_prediction, reference=label)
        ```
        """
        bad_inputs = [input_name for input_name in kwargs if input_name not in self.features]
        if bad_inputs:
            raise ValueError(f"Bad inputs for metric: {bad_inputs}. All required inputs are {list(self.features)}")
        example = {"predictions": prediction, "references": reference, **kwargs}
        example = {input_name: example[input_name] for input_name in self.features}
        example = self.info.features.encode_example(example)
        if self.writer is None:
            self._init_writer()
        try:
            self.writer.write(example)
        except pa.ArrowInvalid:
            error_msg = f"Metric inputs don't match the expected format.\n" f"Expected format: {self.features},\n"
            error_msg_inputs = ",\n".join(
                f"Input {input_name}: {summarize_if_long_list(example[input_name])}" for input_name in self.features
            )
            error_msg += error_msg_inputs
            raise ValueError(error_msg) from None

    def _init_writer(self, timeout=1):
        if self.num_process > 1:
            if self.process_id == 0:
                file_path = os.path.join(self.data_dir, f"{self.experiment_id}-{self.num_process}-rdv.lock")
                self.rendez_vous_lock = FileLock(file_path)
                try:
                    self.rendez_vous_lock.acquire(timeout=timeout)
                except TimeoutError:
                    raise ValueError(
                        f"Error in _init_writer: another metric instance is already using the local cache file at {file_path}. "
                        f"Please specify an experiment_id (currently: {self.experiment_id}) to avoid collision "
                        f"between distributed metric instances."
                    ) from None

        if self.keep_in_memory:
            self.buf_writer = pa.BufferOutputStream()
            self.writer = ArrowWriter(
                features=self.info.features, stream=self.buf_writer, writer_batch_size=self.writer_batch_size
            )
        else:
            self.buf_writer = None

            # Get cache file name and lock it
            if self.cache_file_name is None or self.filelock is None:
                cache_file_name, filelock = self._create_cache_file()  # get ready
                self.cache_file_name = cache_file_name
                self.filelock = filelock

            self.writer = ArrowWriter(
                features=self.info.features, path=self.cache_file_name, writer_batch_size=self.writer_batch_size
            )
        # Setup the rendez-vous here if we are in a distributed setting
        if self.num_process > 1:
            if self.process_id == 0:
                self._check_all_processes_locks()  # wait for everyone to be ready
                self.rendez_vous_lock.release()  # let everyone go
            else:
                self._check_rendez_vous()  # wait for master to be ready and to let everyone go

    def _info(self) -> MetricInfo:
        """Construct the MetricInfo object. See `MetricInfo` for details.

        Warning: This function is only called once and the result is cached for all
        following .info() calls.

        Returns:
            info: (MetricInfo) The metrics information
        """
        raise NotImplementedError

    def download_and_prepare(
        self,
        download_config: Optional[DownloadConfig] = None,
        dl_manager: Optional[DownloadManager] = None,
    ):
        """Downloads and prepares dataset for reading.

        Args:
            download_config (:class:`DownloadConfig`, optional): Specific download configuration parameters.
            dl_manager (:class:`DownloadManager`, optional): Specific download manager to use.
        """
        if dl_manager is None:
            if download_config is None:
                download_config = DownloadConfig()
                download_config.cache_dir = os.path.join(self.data_dir, "downloads")
                download_config.force_download = False

            dl_manager = DownloadManager(
                dataset_name=self.name, download_config=download_config, data_dir=self.data_dir
            )

        self._download_and_prepare(dl_manager)

    def _download_and_prepare(self, dl_manager):
        """Downloads and prepares resources for the metric.

        This is the internal implementation to overwrite called when user calls
        `download_and_prepare`. It should download all required resources for the metric.

        Args:
            dl_manager (:class:`DownloadManager`): `DownloadManager` used to download and cache data.
        """
        return None

    def _compute(self, *, predictions=None, references=None, **kwargs) -> Dict[str, Any]:
        """This method defines the common API for all the metrics in the library"""
        raise NotImplementedError

    def __del__(self):
        if hasattr(self, "filelock") and self.filelock is not None:
            self.filelock.release()
        if hasattr(self, "rendez_vous_lock") and self.rendez_vous_lock is not None:
            self.rendez_vous_lock.release()
        if hasattr(self, "writer"):  # in case it was already deleted
            del self.writer
        if hasattr(self, "data"):  # in case it was already deleted
            del self.data
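

# A minimal usage sketch of the API above: a toy exact-match metric defined by
# overriding `_info` and `_compute` (the class name, function name, and return
# key below are illustrative, not part of the library).
def _example_custom_metric():
    import datasets

    class ExactMatch(datasets.Metric):
        def _info(self):
            return datasets.MetricInfo(
                description="Fraction of predictions that exactly match their reference.",
                citation="",
                features=datasets.Features(
                    {"predictions": datasets.Value("string"), "references": datasets.Value("string")}
                ),
            )

        def _compute(self, predictions=None, references=None):
            matches = sum(p == r for p, r in zip(predictions, references))
            return {"exact_match": matches / len(predictions)}

    metric = ExactMatch()
    metric.add_batch(predictions=["a", "b"], references=["a", "c"])
    return metric.compute()  # {"exact_match": 0.5}
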
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/datasets/search.py
ADDED
@@ -0,0 +1,785 @@
import importlib.util
import os
import tempfile
from pathlib import PurePath
from typing import TYPE_CHECKING, Dict, List, NamedTuple, Optional, Union

import fsspec
import numpy as np

from .features import Sequence
from .utils import logging
from .utils import tqdm as hf_tqdm


if TYPE_CHECKING:
    from .arrow_dataset import Dataset  # noqa: F401

    try:
        from elasticsearch import Elasticsearch  # noqa: F401

    except ImportError:
        pass
    try:
        import faiss  # noqa: F401

    except ImportError:
        pass

_has_elasticsearch = importlib.util.find_spec("elasticsearch") is not None
_has_faiss = importlib.util.find_spec("faiss") is not None


logger = logging.get_logger(__name__)


class MissingIndex(Exception):
    pass


class SearchResults(NamedTuple):
    scores: List[float]
    indices: List[int]


class BatchedSearchResults(NamedTuple):
    total_scores: List[List[float]]
    total_indices: List[List[int]]


class NearestExamplesResults(NamedTuple):
    scores: List[float]
    examples: dict


class BatchedNearestExamplesResults(NamedTuple):
    total_scores: List[List[float]]
    total_examples: List[dict]


class BaseIndex:
    """Base class for indexing"""

    def search(self, query, k: int = 10, **kwargs) -> SearchResults:
        """
        To implement.
        This method has to return the scores and the indices of the retrieved examples given a certain query.
        """
        raise NotImplementedError

    def search_batch(self, queries, k: int = 10, **kwargs) -> BatchedSearchResults:
        """Find the nearest examples indices to the queries.

        Args:
            queries (`Union[List[str], np.ndarray]`): The queries as a list of strings if `column` is a text index or as a numpy array if `column` is a vector index.
            k (`int`): The number of examples to retrieve per query.

        Output:
            total_scores (`List[List[float]]`): The retrieval scores of the retrieved examples per query.
            total_indices (`List[List[int]]`): The indices of the retrieved examples per query.
        """
        total_scores, total_indices = [], []
        for query in queries:
            scores, indices = self.search(query, k)
            total_scores.append(scores)
            total_indices.append(indices)
        return BatchedSearchResults(total_scores, total_indices)

    def save(self, file: Union[str, PurePath]):
        """Serialize the index on disk"""
        raise NotImplementedError

    @classmethod
    def load(cls, file: Union[str, PurePath]) -> "BaseIndex":
        """Deserialize the index from disk"""
        raise NotImplementedError
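

# A minimal sketch of a custom `BaseIndex` (illustrative only; this class is
# hypothetical, not library API): brute-force inner-product search over an
# in-memory numpy matrix. `search_batch`, inherited from `BaseIndex`, works
# for free on top of it.
class _ExampleNumpyIndex(BaseIndex):
    def __init__(self, vectors: np.ndarray):
        self.vectors = vectors  # shape: (num_examples, dim)

    def search(self, query, k: int = 10, **kwargs) -> SearchResults:
        # Score every stored vector against the query and keep the k best.
        scores = self.vectors @ np.asarray(query).reshape(-1)
        top_k = np.argsort(-scores)[:k]
        return SearchResults(scores[top_k].tolist(), top_k.tolist())
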
class ElasticSearchIndex(BaseIndex):
    """
    Sparse index using Elasticsearch. It is used to index text and run queries based on BM25 similarity.
    An Elasticsearch server needs to be accessible, and a python client is declared with
    ```
    es_client = Elasticsearch([{'host': 'localhost', 'port': '9200'}])
    ```
    for example.
    """

    def __init__(
        self,
        host: Optional[str] = None,
        port: Optional[int] = None,
        es_client: Optional["Elasticsearch"] = None,
        es_index_name: Optional[str] = None,
        es_index_config: Optional[dict] = None,
    ):
        if not _has_elasticsearch:
            raise ImportError(
                "You must install ElasticSearch to use ElasticSearchIndex. To do so you can run `pip install elasticsearch==7.7.1` for example"
            )
        if es_client is not None and (host is not None or port is not None):
            raise ValueError("Please specify either `es_client` or `(host, port)`, but not both.")
        host = host or "localhost"
        port = port or 9200

        import elasticsearch.helpers  # noqa: F401 - need this to properly load all the es features
        from elasticsearch import Elasticsearch  # noqa: F811

        self.es_client = es_client if es_client is not None else Elasticsearch([{"host": host, "port": str(port)}])
        self.es_index_name = (
            es_index_name
            if es_index_name is not None
            else "huggingface_datasets_" + os.path.basename(tempfile.NamedTemporaryFile().name)
        )
        self.es_index_config = (
            es_index_config
            if es_index_config is not None
            else {
                "settings": {
                    "number_of_shards": 1,
                    "analysis": {"analyzer": {"stop_standard": {"type": "standard", " stopwords": "_english_"}}},
                },
                "mappings": {"properties": {"text": {"type": "text", "analyzer": "standard", "similarity": "BM25"}}},
            }
        )

    def add_documents(self, documents: Union[List[str], "Dataset"], column: Optional[str] = None):
        """
        Add documents to the index.
        If the documents are inside a certain column, you can specify it using the `column` argument.
        """
        index_name = self.es_index_name
        index_config = self.es_index_config
        self.es_client.indices.create(index=index_name, body=index_config)
        number_of_docs = len(documents)
        progress = hf_tqdm(unit="docs", total=number_of_docs)
        successes = 0

        def passage_generator():
            if column is not None:
                for i, example in enumerate(documents):
                    yield {"text": example[column], "_id": i}
            else:
                for i, example in enumerate(documents):
                    yield {"text": example, "_id": i}

        # create the ES index
        import elasticsearch as es

        for ok, action in es.helpers.streaming_bulk(
            client=self.es_client,
            index=index_name,
            actions=passage_generator(),
        ):
            progress.update(1)
            successes += ok
        if successes != len(documents):
            logger.warning(
                f"Some documents failed to be added to ElasticSearch. Failures: {len(documents)-successes}/{len(documents)}"
            )
        logger.info(f"Indexed {successes:d} documents")

    def search(self, query: str, k=10, **kwargs) -> SearchResults:
        """Find the nearest examples indices to the query.

        Args:
            query (`str`): The query as a string.
            k (`int`): The number of examples to retrieve.

        Output:
            scores (`List[float]`): The retrieval scores of the retrieved examples.
            indices (`List[int]`): The indices of the retrieved examples.
        """
        response = self.es_client.search(
            index=self.es_index_name,
            body={"query": {"multi_match": {"query": query, "fields": ["text"], "type": "cross_fields"}}, "size": k},
            **kwargs,
        )
        hits = response["hits"]["hits"]
        return SearchResults([hit["_score"] for hit in hits], [int(hit["_id"]) for hit in hits])

    def search_batch(self, queries, k: int = 10, max_workers=10, **kwargs) -> BatchedSearchResults:
        import concurrent.futures

        total_scores, total_indices = [None] * len(queries), [None] * len(queries)
        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
            future_to_index = {executor.submit(self.search, query, k, **kwargs): i for i, query in enumerate(queries)}
            for future in concurrent.futures.as_completed(future_to_index):
                index = future_to_index[future]
                results: SearchResults = future.result()
                total_scores[index] = results.scores
                total_indices[index] = results.indices
        return BatchedSearchResults(total_indices=total_indices, total_scores=total_scores)
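

# A minimal usage sketch for `ElasticSearchIndex` (illustrative; assumes an
# Elasticsearch server reachable at localhost:9200 and the `elasticsearch`
# client installed).
def _example_elasticsearch_usage():
    index = ElasticSearchIndex(host="localhost", port=9200)
    index.add_documents(["foo bar", "bar is great", "machine learning rocks"])
    scores, doc_ids = index.search("machine learning", k=2)  # BM25-ranked hits
    return scores, doc_ids
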
class FaissIndex(BaseIndex):
    """
    Dense index using Faiss. It is used to index vectors.
    Faiss is a library for efficient similarity search and clustering of dense vectors.
    It contains algorithms that search in sets of vectors of any size, up to ones that possibly do not fit in RAM.
    You can find more information about Faiss here:
    - For index types and the string factory: https://github.com/facebookresearch/faiss/wiki/The-index-factory
    - For GPU settings: https://github.com/facebookresearch/faiss/wiki/Faiss-on-the-GPU
    """

    def __init__(
        self,
        device: Optional[Union[int, List[int]]] = None,
        string_factory: Optional[str] = None,
        metric_type: Optional[int] = None,
        custom_index: Optional["faiss.Index"] = None,
    ):
        """
        Create a Dense index using Faiss. You can specify `device` if you want to run it on GPU (`device` must be the GPU index).
        You can find more information about Faiss here:
        - For `string factory`: https://github.com/facebookresearch/faiss/wiki/The-index-factory
        """
        if string_factory is not None and custom_index is not None:
            raise ValueError("Please specify either `string_factory` or `custom_index` but not both.")
        if device is not None and custom_index is not None:
            raise ValueError(
                "Cannot pass both 'custom_index' and 'device'. "
                "Pass 'custom_index' already transferred to the target device instead."
            )
        self.device = device
        self.string_factory = string_factory
        self.metric_type = metric_type
        self.faiss_index = custom_index
        if not _has_faiss:
            raise ImportError(
                "You must install Faiss to use FaissIndex. To do so you can run `conda install -c pytorch faiss-cpu` or `conda install -c pytorch faiss-gpu`. "
                "A community supported package is also available on pypi: `pip install faiss-cpu` or `pip install faiss-gpu`. "
                "Note that pip may not have the latest version of FAISS, and thus, some of the latest features and bug fixes may not be available."
            )

    def add_vectors(
        self,
        vectors: Union[np.array, "Dataset"],
        column: Optional[str] = None,
        batch_size: int = 1000,
        train_size: Optional[int] = None,
        faiss_verbose: Optional[bool] = None,
    ):
        """
        Add vectors to the index.
        If the arrays are inside a certain column, you can specify it using the `column` argument.
        """
        import faiss  # noqa: F811

        if column and not isinstance(vectors.features[column], Sequence):
            raise ValueError(
                f"Wrong feature type for column '{column}'. Expected 1d array, got {vectors.features[column]}"
            )

        # Create index
        if self.faiss_index is None:
            size = len(vectors[0]) if column is None else len(vectors[0][column])
            if self.string_factory is not None:
                if self.metric_type is None:
                    index = faiss.index_factory(size, self.string_factory)
                else:
                    index = faiss.index_factory(size, self.string_factory, self.metric_type)
            else:
                if self.metric_type is None:
                    index = faiss.IndexFlat(size)
                else:
                    index = faiss.IndexFlat(size, self.metric_type)

            self.faiss_index = self._faiss_index_to_device(index, self.device)
            logger.info(f"Created faiss index of type {type(self.faiss_index)}")

        # Set verbosity level
        if faiss_verbose is not None:
            self.faiss_index.verbose = faiss_verbose
            if hasattr(self.faiss_index, "index") and self.faiss_index.index is not None:
                self.faiss_index.index.verbose = faiss_verbose
            if hasattr(self.faiss_index, "quantizer") and self.faiss_index.quantizer is not None:
                self.faiss_index.quantizer.verbose = faiss_verbose
            if hasattr(self.faiss_index, "clustering_index") and self.faiss_index.clustering_index is not None:
                self.faiss_index.clustering_index.verbose = faiss_verbose

        # Train
        if train_size is not None:
            train_vecs = vectors[:train_size] if column is None else vectors[:train_size][column]
            logger.info(f"Training the index with the first {len(train_vecs)} vectors")
            self.faiss_index.train(train_vecs)
        else:
            logger.info("Ignored the training step of the faiss index as `train_size` is None.")

        # Add vectors
        logger.info(f"Adding {len(vectors)} vectors to the faiss index")
        for i in hf_tqdm(range(0, len(vectors), batch_size)):
            vecs = vectors[i : i + batch_size] if column is None else vectors[i : i + batch_size][column]
            self.faiss_index.add(vecs)

    @staticmethod
    def _faiss_index_to_device(index: "faiss.Index", device: Optional[Union[int, List[int]]] = None) -> "faiss.Index":
        """
        Sends a faiss index to a device.
        A device can either be a positive integer (GPU id), a negative integer (all GPUs),
        or a list of positive integers (select GPUs to use), or `None` for CPU.
        """

        # If device is not specified, then it runs on CPU.
        if device is None:
            return index

        import faiss  # noqa: F811

        # If the device id is given as an integer
        if isinstance(device, int):
            # Positive integers are directly mapped to GPU ids
            if device > -1:
                faiss_res = faiss.StandardGpuResources()
                index = faiss.index_cpu_to_gpu(faiss_res, device, index)
            # And negative integers mean using all GPUs
            else:
                index = faiss.index_cpu_to_all_gpus(index)
        # Device ids given as a list mean mapping to those devices specified.
        elif isinstance(device, (list, tuple)):
            index = faiss.index_cpu_to_gpus_list(index, gpus=list(device))
        else:
            raise TypeError(
                f"The argument type: {type(device)} is not expected. "
                + "Please pass in either nothing, a positive int, a negative int, or a list of positive ints."
            )

        return index

    def search(self, query: np.array, k=10, **kwargs) -> SearchResults:
        """Find the nearest examples indices to the query.

        Args:
            query (`np.array`): The query as a numpy array.
            k (`int`): The number of examples to retrieve.

        Output:
            scores (`List[float]`): The retrieval scores of the retrieved examples.
            indices (`List[int]`): The indices of the retrieved examples.
        """
        if len(query.shape) != 1 and (len(query.shape) != 2 or query.shape[0] != 1):
            raise ValueError("Shape of query is incorrect, it has to be either a 1D array or 2D (1, N)")

        queries = query.reshape(1, -1)
        if not queries.flags.c_contiguous:
            queries = np.asarray(queries, order="C")
        scores, indices = self.faiss_index.search(queries, k, **kwargs)
        return SearchResults(scores[0], indices[0].astype(int))

    def search_batch(self, queries: np.array, k=10, **kwargs) -> BatchedSearchResults:
        """Find the nearest examples indices to the queries.

        Args:
            queries (`np.array`): The queries as a numpy array.
            k (`int`): The number of examples to retrieve per query.

        Output:
            total_scores (`List[List[float]]`): The retrieval scores of the retrieved examples per query.
            total_indices (`List[List[int]]`): The indices of the retrieved examples per query.
        """
        if len(queries.shape) != 2:
            raise ValueError("Shape of query must be 2D")
        if not queries.flags.c_contiguous:
            queries = np.asarray(queries, order="C")
        scores, indices = self.faiss_index.search(queries, k, **kwargs)
        return BatchedSearchResults(scores, indices.astype(int))

    def save(self, file: Union[str, PurePath], storage_options: Optional[Dict] = None):
        """Serialize the FaissIndex on disk"""
        import faiss  # noqa: F811

        if self.device is not None and isinstance(self.device, (int, list, tuple)):
            index = faiss.index_gpu_to_cpu(self.faiss_index)
        else:
            index = self.faiss_index

        with fsspec.open(str(file), "wb", **(storage_options or {})) as f:
            faiss.write_index(index, faiss.BufferedIOWriter(faiss.PyCallbackIOWriter(f.write)))

    @classmethod
    def load(
        cls,
        file: Union[str, PurePath],
        device: Optional[Union[int, List[int]]] = None,
        storage_options: Optional[Dict] = None,
    ) -> "FaissIndex":
        """Deserialize the FaissIndex from disk"""
        import faiss  # noqa: F811

        # Instances of FaissIndex are essentially just wrappers around faiss indices.
        faiss_index = cls(device=device)
        with fsspec.open(str(file), "rb", **(storage_options or {})) as f:
            index = faiss.read_index(faiss.BufferedIOReader(faiss.PyCallbackIOReader(f.read)))
        faiss_index.faiss_index = faiss_index._faiss_index_to_device(index, faiss_index.device)
        return faiss_index
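

# A minimal usage sketch for `FaissIndex` (illustrative; requires faiss to be
# installed): build a flat (exact) index over random vectors, query it, and
# round-trip it through `save`/`load`. The path is a placeholder.
def _example_faiss_usage(path="/tmp/example.faiss"):
    vectors = np.random.rand(100, 32).astype(np.float32)  # faiss expects float32
    index = FaissIndex()  # no string_factory/custom_index -> flat CPU index
    index.add_vectors(vectors)
    scores, ids = index.search(vectors[0], k=5)  # nearest neighbours of row 0
    index.save(path)
    restored = FaissIndex.load(path)
    return restored.search_batch(vectors[:4], k=5)
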
class IndexableMixin:
    """Add indexing features to `datasets.Dataset`"""

    def __init__(self):
        self._indexes: Dict[str, BaseIndex] = {}

    def __len__(self):
        raise NotImplementedError

    def __getitem__(self, key):
        raise NotImplementedError

    def is_index_initialized(self, index_name: str) -> bool:
        return index_name in self._indexes

    def _check_index_is_initialized(self, index_name: str):
        if not self.is_index_initialized(index_name):
            raise MissingIndex(
                f"Index with index_name '{index_name}' not initialized yet. Please make sure that you call `add_faiss_index` or `add_elasticsearch_index` first."
            )

    def list_indexes(self) -> List[str]:
        """List the `index_name`/identifiers of all the attached indexes."""
        return list(self._indexes)

    def get_index(self, index_name: str) -> BaseIndex:
        """Return the index with the specified `index_name`.

        Args:
            index_name (`str`): Index name.

        Returns:
            [`BaseIndex`]
        """
        self._check_index_is_initialized(index_name)
        return self._indexes[index_name]

    def add_faiss_index(
        self,
        column: str,
        index_name: Optional[str] = None,
        device: Optional[Union[int, List[int]]] = None,
        string_factory: Optional[str] = None,
        metric_type: Optional[int] = None,
        custom_index: Optional["faiss.Index"] = None,
        batch_size: int = 1000,
        train_size: Optional[int] = None,
        faiss_verbose: bool = False,
    ):
        """Add a dense index using Faiss for fast retrieval.
        The index is created using the vectors of the specified column.
        You can specify `device` if you want to run it on GPU (`device` must be the GPU index, see more below).
        You can find more information about Faiss here:
        - For `string factory`: https://github.com/facebookresearch/faiss/wiki/The-index-factory

        Args:
            column (`str`): The column of the vectors to add to the index.
            index_name (Optional `str`): The index_name/identifier of the index. This is the index_name that is used to call `.get_nearest` or `.search`.
                By default it corresponds to `column`.
            device (Optional `Union[int, List[int]]`): If positive integer, this is the index of the GPU to use. If negative integer, use all GPUs.
                If a list of positive integers is passed in, run only on those GPUs. By default it uses the CPU.
            string_factory (Optional `str`): This is passed to the index factory of Faiss to create the index. Default index class is ``IndexFlat``.
            metric_type (Optional `int`): Type of metric. Ex: `faiss.METRIC_INNER_PRODUCT` or `faiss.METRIC_L2`.
            custom_index (Optional `faiss.Index`): Custom Faiss index that you already have instantiated and configured for your needs.
            batch_size (Optional `int`): Size of the batch to use while adding vectors to the FaissIndex. Default value is 1000.
                <Added version="2.4.0"/>
            train_size (Optional `int`): If the index needs a training step, specifies how many vectors will be used to train the index.
            faiss_verbose (`bool`, defaults to False): Enable the verbosity of the Faiss index.
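
        Example (a sketch; assumes the dataset has an "embeddings" column of 1d float arrays
        and `query_vector` is a numpy array of the same dimension):

        ```py
        >>> ds.add_faiss_index(column="embeddings")
        >>> scores, indices = ds.search("embeddings", query_vector, k=10)
        ```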
"""
|
| 486 |
+
index_name = index_name if index_name is not None else column
|
| 487 |
+
faiss_index = FaissIndex(
|
| 488 |
+
device=device, string_factory=string_factory, metric_type=metric_type, custom_index=custom_index
|
| 489 |
+
)
|
| 490 |
+
faiss_index.add_vectors(
|
| 491 |
+
self, column=column, batch_size=batch_size, train_size=train_size, faiss_verbose=faiss_verbose
|
| 492 |
+
)
|
| 493 |
+
self._indexes[index_name] = faiss_index
|
| 494 |
+
|
| 495 |
+
def add_faiss_index_from_external_arrays(
|
| 496 |
+
self,
|
| 497 |
+
external_arrays: np.array,
|
| 498 |
+
index_name: str,
|
| 499 |
+
device: Optional[Union[int, List[int]]] = None,
|
| 500 |
+
string_factory: Optional[str] = None,
|
| 501 |
+
metric_type: Optional[int] = None,
|
| 502 |
+
custom_index: Optional["faiss.Index"] = None,
|
| 503 |
+
batch_size: int = 1000,
|
| 504 |
+
train_size: Optional[int] = None,
|
| 505 |
+
faiss_verbose: bool = False,
|
| 506 |
+
):
|
| 507 |
+
"""Add a dense index using Faiss for fast retrieval.
|
| 508 |
+
The index is created using the vectors of `external_arrays`.
|
| 509 |
+
You can specify `device` if you want to run it on GPU (`device` must be the GPU index).
|
| 510 |
+
You can find more information about Faiss here:
|
| 511 |
+
- For `string factory`: https://github.com/facebookresearch/faiss/wiki/The-index-factory
|
| 512 |
+
|
| 513 |
+
Args:
|
| 514 |
+
external_arrays (`np.array`): If you want to use arrays from outside the lib for the index, you can set `external_arrays`.
|
| 515 |
+
It will use `external_arrays` to create the Faiss index instead of the arrays in the given `column`.
|
| 516 |
+
index_name (`str`): The index_name/identifier of the index. This is the index_name that is used to call `.get_nearest` or `.search`.
|
| 517 |
+
device (Optional `Union[int, List[int]]`): If positive integer, this is the index of the GPU to use. If negative integer, use all GPUs.
|
| 518 |
+
If a list of positive integers is passed in, run only on those GPUs. By default it uses the CPU.
|
| 519 |
+
string_factory (Optional `str`): This is passed to the index factory of Faiss to create the index. Default index class is IndexFlatIP.
|
| 520 |
+
metric_type (Optional `int`): Type of metric. Ex: `faiss.METRIC_INNER_PRODUCT` or `faiss.METRIC_L2`.
|
| 521 |
+
custom_index (Optional `faiss.Index`): Custom Faiss index that you already have instantiated and configured for your needs.
|
| 522 |
+
batch_size (Optional `int`): Size of the batch to use while adding vectors to the FaissIndex. Default value is 1000.
|
| 523 |
+
<Added version="2.4.0"/>
|
| 524 |
+
train_size (Optional `int`): If the index needs a training step, specifies how many vectors will be used to train the index.
|
| 525 |
+
faiss_verbose (`bool`, defaults to False): Enable the verbosity of the Faiss index.
|
| 526 |
+
"""
|
| 527 |
+
faiss_index = FaissIndex(
|
| 528 |
+
device=device, string_factory=string_factory, metric_type=metric_type, custom_index=custom_index
|
| 529 |
+
)
|
| 530 |
+
faiss_index.add_vectors(
|
| 531 |
+
external_arrays, column=None, batch_size=batch_size, train_size=train_size, faiss_verbose=faiss_verbose
|
| 532 |
+
)
|
| 533 |
+
self._indexes[index_name] = faiss_index
|
| 534 |
+
|
| 535 |
+
def save_faiss_index(self, index_name: str, file: Union[str, PurePath], storage_options: Optional[Dict] = None):
|
| 536 |
+
"""Save a FaissIndex on disk.
|
| 537 |
+
|
| 538 |
+
Args:
|
| 539 |
+
index_name (`str`): The index_name/identifier of the index. This is the index_name that is used to call `.get_nearest` or `.search`.
|
| 540 |
+
file (`str`): The path to the serialized faiss index on disk or remote URI (e.g. `"s3://my-bucket/index.faiss"`).
|
| 541 |
+
storage_options (`dict`, *optional*):
|
| 542 |
+
Key/value pairs to be passed on to the file-system backend, if any.
|
| 543 |
+
|
| 544 |
+
<Added version="2.11.0"/>
|
| 545 |
+
|
| 546 |
+
"""
|
| 547 |
+
index = self.get_index(index_name)
|
| 548 |
+
if not isinstance(index, FaissIndex):
|
| 549 |
+
raise ValueError(f"Index '{index_name}' is not a FaissIndex but a '{type(index)}'")
|
| 550 |
+
index.save(file, storage_options=storage_options)
|
| 551 |
+
logger.info(f"Saved FaissIndex {index_name} at {file}")
|
| 552 |
+
|
| 553 |
+
def load_faiss_index(
|
| 554 |
+
self,
|
| 555 |
+
index_name: str,
|
| 556 |
+
file: Union[str, PurePath],
|
| 557 |
+
device: Optional[Union[int, List[int]]] = None,
|
| 558 |
+
storage_options: Optional[Dict] = None,
|
| 559 |
+
):
|
| 560 |
+
"""Load a FaissIndex from disk.
|
| 561 |
+
|
| 562 |
+
If you want to do additional configurations, you can have access to the faiss index object by doing
|
| 563 |
+
`.get_index(index_name).faiss_index` to make it fit your needs.
|
| 564 |
+
|
| 565 |
+
Args:
|
| 566 |
+
index_name (`str`): The index_name/identifier of the index. This is the index_name that is used to
|
| 567 |
+
call `.get_nearest` or `.search`.
|
| 568 |
+
file (`str`): The path to the serialized faiss index on disk or remote URI (e.g. `"s3://my-bucket/index.faiss"`).
|
| 569 |
+
device (Optional `Union[int, List[int]]`): If positive integer, this is the index of the GPU to use. If negative integer, use all GPUs.
|
| 570 |
+
If a list of positive integers is passed in, run only on those GPUs. By default it uses the CPU.
|
| 571 |
+
storage_options (`dict`, *optional*):
|
| 572 |
+
Key/value pairs to be passed on to the file-system backend, if any.
|
| 573 |
+
|
| 574 |
+
<Added version="2.11.0"/>
|
| 575 |
+
|
| 576 |
+
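
        Example (a sketch; `file` may be a local path or a remote URI):

        ```py
        >>> ds.save_faiss_index("embeddings", "my_index.faiss")
        >>> ds.load_faiss_index("embeddings", "my_index.faiss")
        ```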
"""
|
| 577 |
+
index = FaissIndex.load(file, device=device, storage_options=storage_options)
|
| 578 |
+
if index.faiss_index.ntotal != len(self):
|
| 579 |
+
raise ValueError(
|
| 580 |
+
f"Index size should match Dataset size, but Index '{index_name}' at {file} has {index.faiss_index.ntotal} elements while the dataset has {len(self)} examples."
|
| 581 |
+
)
|
| 582 |
+
self._indexes[index_name] = index
|
| 583 |
+
logger.info(f"Loaded FaissIndex {index_name} from {file}")
|
| 584 |
+
|
| 585 |
+
def add_elasticsearch_index(
|
| 586 |
+
self,
|
| 587 |
+
column: str,
|
| 588 |
+
index_name: Optional[str] = None,
|
| 589 |
+
host: Optional[str] = None,
|
| 590 |
+
port: Optional[int] = None,
|
| 591 |
+
es_client: Optional["Elasticsearch"] = None,
|
| 592 |
+
es_index_name: Optional[str] = None,
|
| 593 |
+
es_index_config: Optional[dict] = None,
|
| 594 |
+
):
|
| 595 |
+
"""Add a text index using ElasticSearch for fast retrieval.
|
| 596 |
+
|
| 597 |
+
Args:
|
| 598 |
+
column (`str`): The column of the documents to add to the index.
|
| 599 |
+
index_name (Optional `str`): The index_name/identifier of the index. This is the index name that is used to call `.get_nearest` or `.search`.
|
| 600 |
+
By default it corresponds to `column`.
|
| 601 |
+
host (Optional `str`, defaults to localhost):
|
| 602 |
+
host of where ElasticSearch is running
|
| 603 |
+
port (Optional `str`, defaults to 9200):
|
| 604 |
+
port of where ElasticSearch is running
|
| 605 |
+
es_client (Optional `elasticsearch.Elasticsearch`):
|
| 606 |
+
The elasticsearch client used to create the index if host and port are None.
|
| 607 |
+
es_index_name (Optional `str`): The elasticsearch index name used to create the index.
|
| 608 |
+
es_index_config (Optional `dict`):
|
| 609 |
+
The configuration of the elasticsearch index.
|
| 610 |
+
Default config is:
|
| 611 |
+
|
| 612 |
+
Config::
|
| 613 |
+
|
| 614 |
+
{
|
| 615 |
+
"settings": {
|
| 616 |
+
"number_of_shards": 1,
|
| 617 |
+
"analysis": {"analyzer": {"stop_standard": {"type": "standard", " stopwords": "_english_"}}},
|
| 618 |
+
},
|
| 619 |
+
"mappings": {
|
| 620 |
+
"properties": {
|
| 621 |
+
"text": {
|
| 622 |
+
"type": "text",
|
| 623 |
+
"analyzer": "standard",
|
| 624 |
+
"similarity": "BM25"
|
| 625 |
+
},
|
| 626 |
+
}
|
| 627 |
+
},
|
| 628 |
+
}
|
| 629 |
+
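
        Example (a sketch; assumes an Elasticsearch server at localhost:9200):

        ```py
        >>> es_client = Elasticsearch([{"host": "localhost", "port": "9200"}])
        >>> ds.add_elasticsearch_index(column="context", es_client=es_client)
        >>> scores, indices = ds.search("context", "machine learning")
        ```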
"""
|
| 630 |
+
index_name = index_name if index_name is not None else column
|
| 631 |
+
es_index = ElasticSearchIndex(
|
| 632 |
+
host=host, port=port, es_client=es_client, es_index_name=es_index_name, es_index_config=es_index_config
|
| 633 |
+
)
|
| 634 |
+
es_index.add_documents(self, column=column)
|
| 635 |
+
self._indexes[index_name] = es_index
|
| 636 |
+
|
| 637 |
+
def load_elasticsearch_index(
|
| 638 |
+
self,
|
| 639 |
+
index_name: str,
|
| 640 |
+
es_index_name: str,
|
| 641 |
+
host: Optional[str] = None,
|
| 642 |
+
port: Optional[int] = None,
|
| 643 |
+
es_client: Optional["Elasticsearch"] = None,
|
| 644 |
+
es_index_config: Optional[dict] = None,
|
| 645 |
+
):
|
| 646 |
+
"""Load an existing text index using ElasticSearch for fast retrieval.
|
| 647 |
+
|
| 648 |
+
Args:
|
| 649 |
+
index_name (`str`):
|
| 650 |
+
The `index_name`/identifier of the index. This is the index name that is used to call `get_nearest` or `search`.
|
| 651 |
+
es_index_name (`str`):
|
| 652 |
+
The name of elasticsearch index to load.
|
| 653 |
+
host (`str`, *optional*, defaults to `localhost`):
|
| 654 |
+
Host of where ElasticSearch is running.
|
| 655 |
+
port (`str`, *optional*, defaults to `9200`):
|
| 656 |
+
Port of where ElasticSearch is running.
|
| 657 |
+
es_client (`elasticsearch.Elasticsearch`, *optional*):
|
| 658 |
+
The elasticsearch client used to create the index if host and port are `None`.
|
| 659 |
+
es_index_config (`dict`, *optional*):
|
| 660 |
+
The configuration of the elasticsearch index.
|
| 661 |
+
Default config is:
|
| 662 |
+
```
|
| 663 |
+
{
|
| 664 |
+
"settings": {
|
| 665 |
+
"number_of_shards": 1,
|
| 666 |
+
"analysis": {"analyzer": {"stop_standard": {"type": "standard", " stopwords": "_english_"}}},
|
| 667 |
+
},
|
| 668 |
+
"mappings": {
|
| 669 |
+
"properties": {
|
| 670 |
+
"text": {
|
| 671 |
+
"type": "text",
|
| 672 |
+
"analyzer": "standard",
|
| 673 |
+
"similarity": "BM25"
|
| 674 |
+
},
|
| 675 |
+
}
|
| 676 |
+
},
|
| 677 |
+
}
|
| 678 |
+
```
|
| 679 |
+
"""
|
| 680 |
+
self._indexes[index_name] = ElasticSearchIndex(
|
| 681 |
+
host=host, port=port, es_client=es_client, es_index_name=es_index_name, es_index_config=es_index_config
|
| 682 |
+
)
|
| 683 |
+
|
| 684 |
+
def drop_index(self, index_name: str):
|
| 685 |
+
"""Drop the index with the specified column.
|
| 686 |
+
|
| 687 |
+
Args:
|
| 688 |
+
index_name (`str`):
|
| 689 |
+
The `index_name`/identifier of the index.
|
| 690 |
+
"""
|
| 691 |
+
del self._indexes[index_name]
|
| 692 |
+
|
| 693 |
+
def search(self, index_name: str, query: Union[str, np.array], k: int = 10, **kwargs) -> SearchResults:
|
| 694 |
+
"""Find the nearest examples indices in the dataset to the query.
|
| 695 |
+
|
| 696 |
+
Args:
|
| 697 |
+
index_name (`str`):
|
| 698 |
+
The name/identifier of the index.
|
| 699 |
+
query (`Union[str, np.ndarray]`):
|
| 700 |
+
The query as a string if `index_name` is a text index or as a numpy array if `index_name` is a vector index.
|
| 701 |
+
k (`int`):
|
| 702 |
+
The number of examples to retrieve.
|
| 703 |
+
|
| 704 |
+
Returns:
|
| 705 |
+
`(scores, indices)`:
|
| 706 |
+
A tuple of `(scores, indices)` where:
|
| 707 |
+
- **scores** (`List[List[float]`): the retrieval scores from either FAISS (`IndexFlatL2` by default) or ElasticSearch of the retrieved examples
|
| 708 |
+
- **indices** (`List[List[int]]`): the indices of the retrieved examples
|
| 709 |
+
"""
|
| 710 |
+
self._check_index_is_initialized(index_name)
|
| 711 |
+
return self._indexes[index_name].search(query, k, **kwargs)
|
| 712 |
+
|
| 713 |
+
def search_batch(
|
| 714 |
+
self, index_name: str, queries: Union[List[str], np.array], k: int = 10, **kwargs
|
| 715 |
+
) -> BatchedSearchResults:
|
| 716 |
+
"""Find the nearest examples indices in the dataset to the query.
|
| 717 |
+
|
| 718 |
+
Args:
|
| 719 |
+
index_name (`str`):
|
| 720 |
+
The `index_name`/identifier of the index.
|
| 721 |
+
queries (`Union[List[str], np.ndarray]`):
|
| 722 |
+
The queries as a list of strings if `index_name` is a text index or as a numpy array if `index_name` is a vector index.
|
| 723 |
+
k (`int`):
|
| 724 |
+
The number of examples to retrieve per query.
|
| 725 |
+
|
| 726 |
+
Returns:
|
| 727 |
+
`(total_scores, total_indices)`:
|
| 728 |
+
A tuple of `(total_scores, total_indices)` where:
|
| 729 |
+
- **total_scores** (`List[List[float]`): the retrieval scores from either FAISS (`IndexFlatL2` by default) or ElasticSearch of the retrieved examples per query
|
| 730 |
+
- **total_indices** (`List[List[int]]`): the indices of the retrieved examples per query
|
| 731 |
+
"""
|
| 732 |
+
self._check_index_is_initialized(index_name)
|
| 733 |
+
return self._indexes[index_name].search_batch(queries, k, **kwargs)
|
| 734 |
+
|
| 735 |
+
def get_nearest_examples(
|
| 736 |
+
self, index_name: str, query: Union[str, np.array], k: int = 10, **kwargs
|
| 737 |
+
) -> NearestExamplesResults:
|
| 738 |
+
"""Find the nearest examples in the dataset to the query.
|
| 739 |
+
|
| 740 |
+
Args:
|
| 741 |
+
index_name (`str`):
|
| 742 |
+
The index_name/identifier of the index.
|
| 743 |
+
query (`Union[str, np.ndarray]`):
|
| 744 |
+
The query as a string if `index_name` is a text index or as a numpy array if `index_name` is a vector index.
|
| 745 |
+
k (`int`):
|
| 746 |
+
The number of examples to retrieve.
|
| 747 |
+
|
| 748 |
+
Returns:
|
| 749 |
+
`(scores, examples)`:
|
| 750 |
+
A tuple of `(scores, examples)` where:
|
| 751 |
+
- **scores** (`List[float]`): the retrieval scores from either FAISS (`IndexFlatL2` by default) or ElasticSearch of the retrieved examples
|
| 752 |
+
- **examples** (`dict`): the retrieved examples
|
| 753 |
+
"""
|
| 754 |
+
self._check_index_is_initialized(index_name)
|
| 755 |
+
scores, indices = self.search(index_name, query, k, **kwargs)
|
| 756 |
+
top_indices = [i for i in indices if i >= 0]
|
| 757 |
+
return NearestExamplesResults(scores[: len(top_indices)], self[top_indices])
|
| 758 |
+
|
| 759 |
+
def get_nearest_examples_batch(
|
| 760 |
+
self, index_name: str, queries: Union[List[str], np.array], k: int = 10, **kwargs
|
| 761 |
+
) -> BatchedNearestExamplesResults:
|
| 762 |
+
"""Find the nearest examples in the dataset to the query.
|
| 763 |
+
|
| 764 |
+
Args:
|
| 765 |
+
index_name (`str`):
|
| 766 |
+
The `index_name`/identifier of the index.
|
| 767 |
+
queries (`Union[List[str], np.ndarray]`):
|
| 768 |
+
The queries as a list of strings if `index_name` is a text index or as a numpy array if `index_name` is a vector index.
|
| 769 |
+
k (`int`):
|
| 770 |
+
The number of examples to retrieve per query.
|
| 771 |
+
|
| 772 |
+
Returns:
|
| 773 |
+
`(total_scores, total_examples)`:
|
| 774 |
+
A tuple of `(total_scores, total_examples)` where:
|
| 775 |
+
- **total_scores** (`List[List[float]`): the retrieval scores from either FAISS (`IndexFlatL2` by default) or ElasticSearch of the retrieved examples per query
|
| 776 |
+
- **total_examples** (`List[dict]`): the retrieved examples per query
|
| 777 |
+
"""
|
| 778 |
+
self._check_index_is_initialized(index_name)
|
| 779 |
+
total_scores, total_indices = self.search_batch(index_name, queries, k, **kwargs)
|
| 780 |
+
total_scores = [
|
| 781 |
+
scores_i[: len([i for i in indices_i if i >= 0])]
|
| 782 |
+
for scores_i, indices_i in zip(total_scores, total_indices)
|
| 783 |
+
]
|
| 784 |
+
total_samples = [self[[i for i in indices if i >= 0]] for indices in total_indices]
|
| 785 |
+
return BatchedNearestExamplesResults(total_scores, total_samples)
|
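
The index API above is easiest to see end-to-end with a tiny example. The sketch below is an editorial illustration, not part of `search.py`: it assumes `faiss` is installed and uses a hypothetical two-dimensional `"embeddings"` column, but any float32 vectors work the same way.

```python
import numpy as np
from datasets import Dataset

# Toy dataset with a precomputed "embeddings" column (hypothetical data).
ds = Dataset.from_dict(
    {"text": ["apple", "banana", "car"],
     "embeddings": [[0.0, 1.0], [0.1, 0.9], [1.0, 0.0]]}
)
ds.add_faiss_index(column="embeddings")  # builds an IndexFlatL2 by default

query = np.array([0.0, 0.95], dtype=np.float32)  # must match the vector dtype/dim
# get_nearest_examples wraps search(): it drops padded -1 indices and
# returns decoded rows instead of raw indices.
scores, examples = ds.get_nearest_examples("embeddings", query, k=2)
print(examples["text"])  # nearest rows first
```
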
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/datasets/splits.py
ADDED
@@ -0,0 +1,635 @@
# Copyright 2020 The HuggingFace Datasets Authors and the TensorFlow Datasets Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Lint as: python3
"""Splits related API."""

import abc
import collections
import copy
import dataclasses
import re
from dataclasses import dataclass
from typing import Dict, List, Optional, Union

from .arrow_reader import FileInstructions, make_file_instructions
from .naming import _split_re
from .utils.py_utils import NonMutableDict, asdict


@dataclass
class SplitInfo:
    name: str = dataclasses.field(default="", metadata={"include_in_asdict_even_if_is_default": True})
    num_bytes: int = dataclasses.field(default=0, metadata={"include_in_asdict_even_if_is_default": True})
    num_examples: int = dataclasses.field(default=0, metadata={"include_in_asdict_even_if_is_default": True})
    shard_lengths: Optional[List[int]] = None

    # Deprecated
    # For backward compatibility, this field needs to always be included in files like
    # dataset_infos.json and dataset_info.json files
    # To do so, we always include it in the output of datasets.utils.py_utils.asdict(split_info)
    dataset_name: Optional[str] = dataclasses.field(
        default=None, metadata={"include_in_asdict_even_if_is_default": True}
    )

    @property
    def file_instructions(self):
        """Returns the list of dict(filename, take, skip)."""
        # `self.dataset_name` is assigned in `SplitDict.add()`.
        instructions = make_file_instructions(
            name=self.dataset_name,
            split_infos=[self],
            instruction=str(self.name),
        )
        return instructions.file_instructions


@dataclass
class SubSplitInfo:
    """Wrapper around a sub split info.
    This class exposes info on the subsplit:
    ```
    ds, info = datasets.load_dataset(..., split='train[75%:]', with_info=True)
    info.splits['train[75%:]'].num_examples
    ```
    """

    instructions: FileInstructions

    @property
    def num_examples(self):
        """Returns the number of examples in the subsplit."""
        return self.instructions.num_examples

    @property
    def file_instructions(self):
        """Returns the list of dict(filename, take, skip)."""
        return self.instructions.file_instructions


class SplitBase(metaclass=abc.ABCMeta):
    # pylint: disable=line-too-long
    """Abstract base class for Split compositionality.

    See the
    [guide on splits](../loading#slice-splits)
    for more information.

    There are three parts to the composition:
    1) The splits are composed (defined, merged, split,...) together before
       calling the `.as_dataset()` function. This is done with the `__add__`,
       `__getitem__`, which return a tree of `SplitBase` (whose leaves
       are the `NamedSplit` objects)

        ```
        split = datasets.Split.TRAIN + datasets.Split.TEST.subsplit(datasets.percent[:50])
        ```

    2) The `SplitBase` is forwarded to the `.as_dataset()` function
       to be resolved into actual read instructions. This is done by the
       `.get_read_instruction()` method which takes the real dataset splits
       (name, number of shards,...) and parses the tree to return a
       `SplitReadInstruction()` object

        ```
        read_instruction = split.get_read_instruction(self.info.splits)
        ```

    3) The `SplitReadInstruction` is then used in the `tf.data.Dataset` pipeline
       to define which files to read and how to skip examples within a file.

    """

    # pylint: enable=line-too-long

    @abc.abstractmethod
    def get_read_instruction(self, split_dict):
        """Parse the descriptor tree and compile all read instructions together.

        Args:
            split_dict: `dict`, The `dict[split_name, SplitInfo]` of the dataset

        Returns:
            split_read_instruction: `SplitReadInstruction`
        """
        raise NotImplementedError("Abstract method")

    def __eq__(self, other):
        """Equality: datasets.Split.TRAIN == 'train'."""
        if isinstance(other, (NamedSplit, str)):
            return False
        raise NotImplementedError("Equality is not implemented between merged/sub splits.")

    def __ne__(self, other):
        """Inequality: datasets.Split.TRAIN != 'test'."""
        return not self.__eq__(other)

    def __add__(self, other):
        """Merging: datasets.Split.TRAIN + datasets.Split.TEST."""
        return _SplitMerged(self, other)

    def subsplit(self, arg=None, k=None, percent=None, weighted=None):  # pylint: disable=redefined-outer-name
        """Divides this split into subsplits.

        There are 3 ways to define subsplits, which correspond to the 3
        arguments `k` (get `k` even subsplits), `percent` (get a slice of the
        dataset with `datasets.percent`), and `weighted` (get subsplits with proportions
        specified by `weighted`).

        Example::

        ```
        # 50% train, 50% test
        train, test = split.subsplit(k=2)
        # 50% train, 25% test, 25% validation
        train, test, validation = split.subsplit(weighted=[2, 1, 1])
        # Extract last 20%
        subsplit = split.subsplit(datasets.percent[-20:])
        ```

        Warning: k and weighted will be converted into percent which means that
        values below the percent will be rounded up or down. The final split may be
        bigger to deal with remainders. For instance:

        ```
        train, test, valid = split.subsplit(k=3)  # 33%, 33%, 34%
        s1, s2, s3, s4 = split.subsplit(weighted=[2, 2, 1, 1])  # 33%, 33%, 16%, 18%
        ```

        Args:
            arg: If no kwargs are given, `arg` will be interpreted as one of
                `k`, `percent`, or `weighted` depending on the type.
                For example:
                ```
                split.subsplit(10)  # Equivalent to split.subsplit(k=10)
                split.subsplit(datasets.percent[:-20])  # percent=datasets.percent[:-20]
                split.subsplit([1, 1, 2])  # weighted=[1, 1, 2]
                ```
            k: `int` If set, subdivide the split into `k` equal parts.
            percent: `datasets.percent slice`, return a single subsplit corresponding to
                a slice of the original split. For example:
                `split.subsplit(datasets.percent[-20:])  # Last 20% of the dataset`.
            weighted: `list[int]`, return a list of subsplits whose proportions match
                the normalized sum of the list. For example:
                `split.subsplit(weighted=[1, 1, 2])  # 25%, 25%, 50%`.

        Returns:
            A subsplit or list of subsplits extracted from this split object.
        """
        # Note that the percent kwargs redefine the outer name datasets.percent. This
        # is done for consistency (.subsplit(percent=datasets.percent[:40]))
        if sum(bool(x) for x in (arg, k, percent, weighted)) != 1:
            raise ValueError("Only one argument of subsplit should be set.")

        # Auto deduce k
        if isinstance(arg, int):
            k = arg
        elif isinstance(arg, slice):
            percent = arg
        elif isinstance(arg, list):
            weighted = arg

        if not (k or percent or weighted):
            raise ValueError(
                f"Invalid split argument {arg}. Only list, slice and int supported. "
                "One of k, weighted or percent should be set to a non empty value."
            )

        def assert_slices_coverage(slices):
            # Ensure that the expanded slices cover all percents.
            assert sum((list(range(*s.indices(100))) for s in slices), []) == list(range(100))

        if k:
            if not 0 < k <= 100:
                raise ValueError(f"Subsplit k should be between 0 and 100, got {k}")
            shift = 100 // k
            slices = [slice(i * shift, (i + 1) * shift) for i in range(k)]
            # Round up last element to ensure all elements are taken
            slices[-1] = slice(slices[-1].start, 100)
            # Internal check to ensure full coverage
            assert_slices_coverage(slices)
            return tuple(_SubSplit(self, s) for s in slices)
        elif percent:
            return _SubSplit(self, percent)
        elif weighted:
            # Normalize the weighted sum
            total = sum(weighted)
            weighted = [100 * x // total for x in weighted]
            # Create the slice for each of the elements
            start = 0
            stop = 0
            slices = []
            for v in weighted:
                stop += v
                slices.append(slice(start, stop))
                start = stop
            # Round up last element to ensure all elements are taken
            slices[-1] = slice(slices[-1].start, 100)
            # Internal check to ensure full coverage
            assert_slices_coverage(slices)
            return tuple(_SubSplit(self, s) for s in slices)
        else:
            # Should not be possible
            raise ValueError("Could not determine the split")


# 2 requirements:
# 1. datasets.percent be sliceable
# 2. datasets.percent be documented
#
# Instances are not documented, so we want datasets.percent to be a class, but to
# have it be sliceable, we need this metaclass.
class PercentSliceMeta(type):
    def __getitem__(cls, slice_value):
        if not isinstance(slice_value, slice):
            raise ValueError(f"datasets.percent should only be called with slice, not {slice_value}")
        return slice_value


class PercentSlice(metaclass=PercentSliceMeta):
    # pylint: disable=line-too-long
    """Syntactic sugar for defining slice subsplits: `datasets.percent[75:-5]`.

    See the
    [guide on splits](../loading#slice-splits)
    for more information.
    """

    # pylint: enable=line-too-long
    pass


percent = PercentSlice  # pylint: disable=invalid-name


class _SplitMerged(SplitBase):
    """Represent two split descriptors merged together."""

    def __init__(self, split1, split2):
        self._split1 = split1
        self._split2 = split2

    def get_read_instruction(self, split_dict):
        read_instruction1 = self._split1.get_read_instruction(split_dict)
        read_instruction2 = self._split2.get_read_instruction(split_dict)
        return read_instruction1 + read_instruction2

    def __repr__(self):
        return f"({repr(self._split1)} + {repr(self._split2)})"


class _SubSplit(SplitBase):
    """Represent a sub split of a split descriptor."""

    def __init__(self, split, slice_value):
        self._split = split
        self._slice_value = slice_value

    def get_read_instruction(self, split_dict):
        return self._split.get_read_instruction(split_dict)[self._slice_value]

    def __repr__(self):
        slice_str = "{start}:{stop}"
        if self._slice_value.step is not None:
            slice_str += ":{step}"
        slice_str = slice_str.format(
            start="" if self._slice_value.start is None else self._slice_value.start,
            stop="" if self._slice_value.stop is None else self._slice_value.stop,
            step=self._slice_value.step,
        )
        return f"{repr(self._split)}(datasets.percent[{slice_str}])"


class NamedSplit(SplitBase):
    """Descriptor corresponding to a named split (train, test, ...).

    Example:
        Each descriptor can be composed with others using addition or slice:

        ```py
        split = datasets.Split.TRAIN.subsplit(datasets.percent[0:25]) + datasets.Split.TEST
        ```

        The resulting split will correspond to 25% of the train split merged with
        100% of the test split.

        A split cannot be added twice, so the following will fail:

        ```py
        split = (
            datasets.Split.TRAIN.subsplit(datasets.percent[:25]) +
            datasets.Split.TRAIN.subsplit(datasets.percent[75:])
        )  # Error
        split = datasets.Split.TEST + datasets.Split.ALL  # Error
        ```

        The slices can be applied only one time. So the following are valid:

        ```py
        split = (
            datasets.Split.TRAIN.subsplit(datasets.percent[:25]) +
            datasets.Split.TEST.subsplit(datasets.percent[:50])
        )
        split = (datasets.Split.TRAIN + datasets.Split.TEST).subsplit(datasets.percent[:50])
        ```

        But this is not valid:

        ```py
        train = datasets.Split.TRAIN
        test = datasets.Split.TEST
        split = train.subsplit(datasets.percent[:25]).subsplit(datasets.percent[:25])
        split = (train.subsplit(datasets.percent[:25]) + test).subsplit(datasets.percent[:50])
        ```
    """

    def __init__(self, name):
        self._name = name
        split_names_from_instruction = [split_instruction.split("[")[0] for split_instruction in name.split("+")]
        for split_name in split_names_from_instruction:
            if not re.match(_split_re, split_name):
                raise ValueError(f"Split name should match '{_split_re}' but got '{split_name}'.")

    def __str__(self):
        return self._name

    def __repr__(self):
        return f"NamedSplit({self._name!r})"

    def __eq__(self, other):
        """Equality: datasets.Split.TRAIN == 'train'."""
        if isinstance(other, NamedSplit):
            return self._name == other._name  # pylint: disable=protected-access
        elif isinstance(other, SplitBase):
            return False
        elif isinstance(other, str):  # Other should be string
            return self._name == other
        else:
            raise ValueError(f"Equality not supported between split {self} and {other}")

    def __lt__(self, other):
        return self._name < other._name  # pylint: disable=protected-access

    def __hash__(self):
        return hash(self._name)

    def get_read_instruction(self, split_dict):
        return SplitReadInstruction(split_dict[self._name])


class NamedSplitAll(NamedSplit):
    """Split corresponding to the union of all defined dataset splits."""

    def __init__(self):
        super().__init__("all")

    def __repr__(self):
        return "NamedSplitAll()"

    def get_read_instruction(self, split_dict):
        # Merge all dataset splits together
        read_instructions = [SplitReadInstruction(s) for s in split_dict.values()]
        return sum(read_instructions, SplitReadInstruction())


class Split:
    # pylint: disable=line-too-long
    """`Enum` for dataset splits.

    Datasets are typically split into different subsets to be used at various
    stages of training and evaluation.

    - `TRAIN`: the training data.
    - `VALIDATION`: the validation data. If present, this is typically used as
      evaluation data while iterating on a model (e.g. changing hyperparameters,
      model architecture, etc.).
    - `TEST`: the testing data. This is the data to report metrics on. Typically
      you do not want to use this during model iteration as you may overfit to it.
    - `ALL`: the union of all defined dataset splits.

    All splits, including compositions, inherit from `datasets.SplitBase`.

    See the [guide](../load_hub#splits) on splits for more information.

    Example:

    ```py
    >>> datasets.SplitGenerator(
    ...     name=datasets.Split.TRAIN,
    ...     gen_kwargs={"split_key": "train", "files": dl_manager.download_and_extract(url)},
    ... ),
    ... datasets.SplitGenerator(
    ...     name=datasets.Split.VALIDATION,
    ...     gen_kwargs={"split_key": "validation", "files": dl_manager.download_and_extract(url)},
    ... ),
    ... datasets.SplitGenerator(
    ...     name=datasets.Split.TEST,
    ...     gen_kwargs={"split_key": "test", "files": dl_manager.download_and_extract(url)},
    ... )
    ```
    """

    # pylint: enable=line-too-long
    TRAIN = NamedSplit("train")
    TEST = NamedSplit("test")
    VALIDATION = NamedSplit("validation")
    ALL = NamedSplitAll()

    def __new__(cls, name):
        """Create a custom split with datasets.Split('custom_name')."""
        return NamedSplitAll() if name == "all" else NamedSplit(name)


# Similar to SplitInfo, but contains an additional slice info
SlicedSplitInfo = collections.namedtuple(
    "SlicedSplitInfo",
    [
        "split_info",
        "slice_value",
    ],
)  # noqa: E231


class SplitReadInstruction:
    """Object containing the reading instruction for the dataset.

    Similarly to `SplitDescriptor` nodes, this object can be composed with itself,
    but the resolution happens instantaneously, instead of keeping track of the
    tree, such that all instructions are compiled and flattened in a single
    SplitReadInstruction object containing the list of files and slice to use.

    Once resolved, the instructions can be accessed with:

    ```
    read_instructions.get_list_sliced_split_info()  # List of splits to use
    ```

    """

    def __init__(self, split_info=None):
        self._splits = NonMutableDict(error_msg="Overlap between splits. Split {key} has been added with " "itself.")

        if split_info:
            self.add(SlicedSplitInfo(split_info=split_info, slice_value=None))

    def add(self, sliced_split):
        """Add a SlicedSplitInfo to the read instructions."""
        # TODO(epot): Check that the number of examples per shard % 100 == 0
        # Otherwise the slices value may be unbalanced and not exactly reflect the
        # requested slice.
        self._splits[sliced_split.split_info.name] = sliced_split

    def __add__(self, other):
        """Merging splits together."""
        # Will raise error if a split has already been added (NonMutableDict)
        # TODO(epot): If a split is already added but there is no overlap between
        # the slices, should merge the slices (ex: [:10] + [80:])
        split_instruction = SplitReadInstruction()
        split_instruction._splits.update(self._splits)  # pylint: disable=protected-access
        split_instruction._splits.update(other._splits)  # pylint: disable=protected-access
        return split_instruction

    def __getitem__(self, slice_value):
        """Sub-splits."""
        # Will raise an error if a split has already been sliced
        split_instruction = SplitReadInstruction()
        for v in self._splits.values():
            if v.slice_value is not None:
                raise ValueError(f"Trying to slice Split {v.split_info.name} which has already been sliced")
            v = v._asdict()
            v["slice_value"] = slice_value
            split_instruction.add(SlicedSplitInfo(**v))
        return split_instruction

    def get_list_sliced_split_info(self):
        return list(self._splits.values())


class SplitDict(dict):
    """Split info object."""

    def __init__(self, *args, dataset_name=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.dataset_name = dataset_name

    def __getitem__(self, key: Union[SplitBase, str]):
        # 1st case: The key exists: `info.splits['train']`
        if str(key) in self:
            return super().__getitem__(str(key))
        # 2nd case: Uses instructions: `info.splits['train[50%]']`
        else:
            instructions = make_file_instructions(
                name=self.dataset_name,
                split_infos=self.values(),
                instruction=key,
            )
            return SubSplitInfo(instructions)

    def __setitem__(self, key: Union[SplitBase, str], value: SplitInfo):
        if key != value.name:
            raise ValueError(f"Cannot add elem. (key mismatch: '{key}' != '{value.name}')")
        super().__setitem__(key, value)

    def add(self, split_info: SplitInfo):
        """Add the split info."""
        if split_info.name in self:
            raise ValueError(f"Split {split_info.name} already present")
        split_info.dataset_name = self.dataset_name
        super().__setitem__(split_info.name, split_info)

    @property
    def total_num_examples(self):
        """Return the total number of examples."""
        return sum(s.num_examples for s in self.values())

    @classmethod
    def from_split_dict(cls, split_infos: Union[List, Dict], dataset_name: Optional[str] = None):
        """Returns a new SplitDict initialized from a Dict or List of `split_infos`."""
        if isinstance(split_infos, dict):
            split_infos = list(split_infos.values())

        if dataset_name is None:
            dataset_name = split_infos[0].get("dataset_name") if split_infos else None

        split_dict = cls(dataset_name=dataset_name)

        for split_info in split_infos:
            if isinstance(split_info, dict):
                split_info = SplitInfo(**split_info)
            split_dict.add(split_info)

        return split_dict

    def to_split_dict(self):
        """Returns a list of SplitInfo protos that we have."""
        out = []
        for split_name, split_info in self.items():
            split_info = copy.deepcopy(split_info)
            split_info.name = split_name
            out.append(split_info)
        return out

    def copy(self):
        return SplitDict.from_split_dict(self.to_split_dict(), self.dataset_name)

    def _to_yaml_list(self) -> list:
        out = [asdict(s) for s in self.to_split_dict()]
        # we don't need the shard lengths in YAML, since it depends on max_shard_size and num_proc
        for split_info_dict in out:
            split_info_dict.pop("shard_lengths", None)
        # we don't need the dataset_name attribute that is deprecated
        for split_info_dict in out:
            split_info_dict.pop("dataset_name", None)
        return out

    @classmethod
    def _from_yaml_list(cls, yaml_data: list) -> "SplitDict":
        return cls.from_split_dict(yaml_data)


@dataclass
class SplitGenerator:
    """Defines the split information for the generator.

    This should be used as the returned value of
    `GeneratorBasedBuilder._split_generators`.
    See `GeneratorBasedBuilder._split_generators` for more info and example
    of usage.

    Args:
        name (`str`):
            Name of the `Split` for which the generator will
            create the examples.
        **gen_kwargs (additional keyword arguments):
            Keyword arguments to forward to the `DatasetBuilder._generate_examples` method
            of the builder.

    Example:

    ```py
    >>> datasets.SplitGenerator(
    ...     name=datasets.Split.TRAIN,
    ...     gen_kwargs={"split_key": "train", "files": dl_manager.download_and_extract(url)},
    ... )
    ```
    """

    name: str
    gen_kwargs: Dict = dataclasses.field(default_factory=dict)
    split_info: SplitInfo = dataclasses.field(init=False)

    def __post_init__(self):
        self.name = str(self.name)  # Make sure we convert NamedSplits to strings
        NamedSplit(self.name)  # check that it's a valid split name
        self.split_info = SplitInfo(name=self.name)
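
As a quick illustration of the `SplitInfo`/`SplitDict` bookkeeping defined above (an editorial sketch, not part of `splits.py`): `SplitDict.add` stamps each `SplitInfo` with the owning dataset name, string keys and `NamedSplit` keys are interchangeable, and `to_split_dict`/`from_split_dict` round-trip the metadata.

```python
from datasets.splits import Split, SplitDict, SplitInfo

splits = SplitDict(dataset_name="demo")  # "demo" is a made-up dataset name
splits.add(SplitInfo(name="train", num_examples=800))
splits.add(SplitInfo(name="test", num_examples=200))

assert splits.total_num_examples == 1000
assert splits["train"].dataset_name == "demo"  # stamped by SplitDict.add
assert Split.TRAIN == "train"                  # NamedSplit compares equal to its string
assert str(Split.TRAIN) in splits              # so it can be used as a dict key

roundtrip = SplitDict.from_split_dict(splits.to_split_dict(), "demo")
assert roundtrip["test"].num_examples == 200
```
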
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/datasets/table.py
ADDED
@@ -0,0 +1,2422 @@
import copy
import os
from functools import partial
from itertools import groupby
from typing import TYPE_CHECKING, Any, Callable, Iterator, List, Optional, Tuple, TypeVar, Union

import numpy as np
import pyarrow as pa
import pyarrow.compute as pc
import pyarrow.types

from . import config
from .utils.logging import get_logger


if TYPE_CHECKING:
    from .features.features import Features, FeatureType


logger = get_logger(__name__)


def inject_arrow_table_documentation(arrow_table_method):
    def wrapper(fn):
        fn.__doc__ = arrow_table_method.__doc__ + (fn.__doc__ if fn.__doc__ is not None else "")
        fn.__doc__ = fn.__doc__.replace("pyarrow.Table", "Table")
        if hasattr(arrow_table_method, "__annotations__"):
            fn.__annotations__ = arrow_table_method.__annotations__
        return fn

    return wrapper


def _in_memory_arrow_table_from_file(filename: str) -> pa.Table:
    in_memory_stream = pa.input_stream(filename)
    opened_stream = pa.ipc.open_stream(in_memory_stream)
    pa_table = opened_stream.read_all()
    return pa_table


def _in_memory_arrow_table_from_buffer(buffer: pa.Buffer) -> pa.Table:
    stream = pa.BufferReader(buffer)
    opened_stream = pa.ipc.open_stream(stream)
    table = opened_stream.read_all()
    return table


def _memory_mapped_record_batch_reader_from_file(filename: str) -> pa.RecordBatchStreamReader:
    memory_mapped_stream = pa.memory_map(filename)
    return pa.ipc.open_stream(memory_mapped_stream)


def read_schema_from_file(filename: str) -> pa.Schema:
    """
    Infer arrow table schema from file without loading whole file into memory.
    Useful especially for very big files.
    """
    with pa.memory_map(filename) as memory_mapped_stream:
        schema = pa.ipc.open_stream(memory_mapped_stream).schema
    return schema


def _memory_mapped_arrow_table_from_file(filename: str) -> pa.Table:
    opened_stream = _memory_mapped_record_batch_reader_from_file(filename)
    pa_table = opened_stream.read_all()
    return pa_table


def _deepcopy(x, memo: dict):
    """deepcopy a regular class instance"""
    cls = x.__class__
    result = cls.__new__(cls)
    memo[id(x)] = result
    for k, v in x.__dict__.items():
        setattr(result, k, copy.deepcopy(v, memo))
    return result


def _interpolation_search(arr: List[int], x: int) -> int:
    """
    Return the position i of a sorted array so that arr[i] <= x < arr[i+1]

    Args:
        arr (`List[int]`): non-empty sorted list of integers
        x (`int`): query

    Returns:
        `int`: the position i so that arr[i] <= x < arr[i+1]

    Raises:
        `IndexError`: if the array is empty or if the query is outside the array values
    """
    i, j = 0, len(arr) - 1
    while i < j and arr[i] <= x < arr[j]:
        k = i + ((j - i) * (x - arr[i]) // (arr[j] - arr[i]))
        if arr[k] <= x < arr[k + 1]:
            return k
        elif arr[k] < x:
            i, j = k + 1, j
        else:
            i, j = i, k
    raise IndexError(f"Invalid query '{x}' for size {arr[-1] if len(arr) else 'none'}.")

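
# --- Editor's illustration (not part of the upstream file) -----------------
# With cumulative batch offsets like the ones IndexedTableMixin builds below,
# _interpolation_search(offsets, x) returns the batch that contains row x:
#
#     offsets = [0, 3, 7, 12]              # batches of 3, 4 and 5 rows
#     _interpolation_search(offsets, 0)    # -> 0 (rows 0-2 live in batch 0)
#     _interpolation_search(offsets, 5)    # -> 1 (offsets[1] <= 5 < offsets[2])
#     _interpolation_search(offsets, 11)   # -> 2 (last valid row index)
#     _interpolation_search(offsets, 12)   # raises IndexError: out of range
# ----------------------------------------------------------------------------
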
class IndexedTableMixin:
    def __init__(self, table: pa.Table):
        self._schema: pa.Schema = table.schema
        self._batches: List[pa.RecordBatch] = [
            recordbatch for recordbatch in table.to_batches() if len(recordbatch) > 0
        ]
        self._offsets: np.ndarray = np.cumsum([0] + [len(b) for b in self._batches], dtype=np.int64)

    def fast_gather(self, indices: Union[List[int], np.ndarray]) -> pa.Table:
        """
        Create a pa.Table by gathering the records at the specified indices. Should be faster
        than pa.concat_tables(table.fast_slice(int(i) % table.num_rows, 1) for i in indices) since NumPy can compute
        the binary searches in parallel, in highly optimized C code.
        """
        if not len(indices):
            raise ValueError("Indices must be non-empty")
        batch_indices = np.searchsorted(self._offsets, indices, side="right") - 1
        return pa.Table.from_batches(
            [
                self._batches[batch_idx].slice(i - self._offsets[batch_idx], 1)
                for batch_idx, i in zip(batch_indices, indices)
            ],
            schema=self._schema,
        )

    def fast_slice(self, offset=0, length=None) -> pa.Table:
        """
        Slice the Table using interpolation search.
        The behavior is the same as `pyarrow.Table.slice` but it's significantly faster.

        Interpolation search is used to find the start and end indexes of the batches we want to keep.
        The batches to keep are then concatenated to form the sliced Table.
        """
        if offset < 0:
            raise IndexError("Offset must be non-negative")
        elif offset >= self._offsets[-1] or (length is not None and length <= 0):
            return pa.Table.from_batches([], schema=self._schema)
        i = _interpolation_search(self._offsets, offset)
        if length is None or length + offset >= self._offsets[-1]:
            batches = self._batches[i:]
            batches[0] = batches[0].slice(offset - self._offsets[i])
        else:
            j = _interpolation_search(self._offsets, offset + length - 1)
            batches = self._batches[i : j + 1]
            batches[-1] = batches[-1].slice(0, offset + length - self._offsets[j])
            batches[0] = batches[0].slice(offset - self._offsets[i])
        return pa.Table.from_batches(batches, schema=self._schema)


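# Illustrative sketch, not part of the original module: the index built by
# `IndexedTableMixin` in action. The table below is split into record batches
# of 3 and 2 rows, so `_offsets` is [0, 3, 5]; `fast_slice` and `fast_gather`
# locate batches through those offsets instead of scanning them one by one.
if __name__ == "__main__":
    _batched = pa.Table.from_batches(pa.table({"n": [0, 1, 2, 3, 4]}).to_batches(max_chunksize=3))
    _indexed = IndexedTableMixin(_batched)
    print(_indexed.fast_slice(2, 2).to_pydict())  # {'n': [2, 3]}
    print(_indexed.fast_gather([0, 4]).to_pydict())  # {'n': [0, 4]}

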
class Table(IndexedTableMixin):
    """
    Wraps a pyarrow Table by using composition.
    This is the base class for `InMemoryTable`, `MemoryMappedTable` and `ConcatenationTable`.

    It implements all the basic attributes/methods of the pyarrow Table class except
    the Table transforms: `slice, filter, flatten, combine_chunks, cast, add_column,
    append_column, remove_column, set_column, rename_columns` and `drop`.

    The implementation of these methods differs for the subclasses.
    """

    def __init__(self, table: pa.Table):
        super().__init__(table)
        self.table = table

    def __deepcopy__(self, memo: dict):
        # arrow tables are immutable, so there's no need to copy self.table
        # moreover calling deepcopy on a pyarrow table seems to make pa.total_allocated_bytes() decrease for some reason
        # by adding it to the memo, self.table won't be copied
        memo[id(self.table)] = self.table
        # same for the recordbatches used by the index
        memo[id(self._batches)] = list(self._batches)
        return _deepcopy(self, memo)

    def validate(self, *args, **kwargs):
        """
        Perform validation checks. An exception is raised if validation fails.

        By default only cheap validation checks are run. Pass `full=True`
        for thorough validation checks (potentially `O(n)`).

        Args:
            full (`bool`, defaults to `False`):
                If `True`, run expensive checks, otherwise cheap checks only.

        Raises:
            `pa.lib.ArrowInvalid`: if validation fails
        """
        return self.table.validate(*args, **kwargs)

    def equals(self, *args, **kwargs):
        """
        Check if contents of two tables are equal.

        Args:
            other ([`~datasets.table.Table`]):
                Table to compare against.
            check_metadata (`bool`, defaults to `False`):
                Whether schema metadata equality should be checked as well.

        Returns:
            `bool`
        """
        args = tuple(arg.table if isinstance(arg, Table) else arg for arg in args)
        kwargs = {k: v.table if isinstance(v, Table) else v for k, v in kwargs.items()}
        return self.table.equals(*args, **kwargs)

    def to_batches(self, *args, **kwargs):
        """
        Convert Table to list of (contiguous) `RecordBatch` objects.

        Args:
            max_chunksize (`int`, defaults to `None`):
                Maximum size for `RecordBatch` chunks. Individual chunks may be
                smaller depending on the chunk layout of individual columns.

        Returns:
            `List[pyarrow.RecordBatch]`
        """
        return self.table.to_batches(*args, **kwargs)

    def to_pydict(self, *args, **kwargs):
        """
        Convert the Table to a `dict` or `OrderedDict`.

        Returns:
            `dict`
        """
        return self.table.to_pydict(*args, **kwargs)

    def to_pylist(self, *args, **kwargs):
        """
        Convert the Table to a list.

        Returns:
            `list`
        """
        return self.table.to_pylist(*args, **kwargs)

    def to_pandas(self, *args, **kwargs):
        """
        Convert to a pandas-compatible NumPy array or DataFrame, as appropriate.

        Args:
            memory_pool (`MemoryPool`, defaults to `None`):
                Arrow MemoryPool to use for allocations. Uses the default memory
                pool if not passed.
            strings_to_categorical (`bool`, defaults to `False`):
                Encode string (UTF8) and binary types to `pandas.Categorical`.
            categories (`list`, defaults to `empty`):
                List of fields that should be returned as `pandas.Categorical`. Only
                applies to table-like data structures.
            zero_copy_only (`bool`, defaults to `False`):
                Raise an `ArrowException` if this function call would require copying
                the underlying data.
            integer_object_nulls (`bool`, defaults to `False`):
                Cast integers with nulls to objects.
            date_as_object (`bool`, defaults to `True`):
                Cast dates to objects. If `False`, convert to `datetime64[ns]` dtype.
            timestamp_as_object (`bool`, defaults to `False`):
                Cast non-nanosecond timestamps (`np.datetime64`) to objects. This is
                useful if you have timestamps that don't fit in the normal date
                range of nanosecond timestamps (1678 CE-2262 CE).
                If `False`, all timestamps are converted to `datetime64[ns]` dtype.
            use_threads (`bool`, defaults to `True`):
                Whether to parallelize the conversion using multiple threads.
            deduplicate_objects (`bool`, defaults to `False`):
                Do not create multiple copies of Python objects when created, to save
                on memory use. Conversion will be slower.
            ignore_metadata (`bool`, defaults to `False`):
                If `True`, do not use the 'pandas' metadata to reconstruct the
                DataFrame index, if present.
            safe (`bool`, defaults to `True`):
                For certain data types, a cast is needed in order to store the
                data in a pandas DataFrame or Series (e.g. timestamps are always
                stored as nanoseconds in pandas). This option controls whether it
                is a safe cast or not.
            split_blocks (`bool`, defaults to `False`):
                If `True`, generate one internal "block" for each column when
                creating a pandas.DataFrame from a `RecordBatch` or `Table`. While this
                can temporarily reduce memory, note that various pandas operations
                can trigger "consolidation" which may balloon memory use.
            self_destruct (`bool`, defaults to `False`):
                EXPERIMENTAL: If `True`, attempt to deallocate the originating Arrow
                memory while converting the Arrow object to pandas. If you use the
                object after calling `to_pandas` with this option it will crash your
                program.
            types_mapper (`function`, defaults to `None`):
                A function mapping a pyarrow DataType to a pandas `ExtensionDtype`.
                This can be used to override the default pandas type for conversion
                of built-in pyarrow types or in absence of `pandas_metadata` in the
                Table schema. The function receives a pyarrow DataType and is
                expected to return a pandas `ExtensionDtype` or `None` if the
                default conversion should be used for that type. If you have
                a dictionary mapping, you can pass `dict.get` as function.

        Returns:
            `pandas.Series` or `pandas.DataFrame`: `pandas.Series` or `pandas.DataFrame` depending on type of object
        """
        return self.table.to_pandas(*args, **kwargs)

    def to_string(self, *args, **kwargs):
        return self.table.to_string(*args, **kwargs)

    def to_reader(self, max_chunksize: Optional[int] = None):
        """
        Convert the Table to a RecordBatchReader.

        Note that this method is zero-copy, it merely exposes the same data under a different API.

        Args:
            max_chunksize (`int`, defaults to `None`):
                Maximum size for RecordBatch chunks. Individual chunks may be smaller depending
                on the chunk layout of individual columns.

        Returns:
            `pyarrow.RecordBatchReader`
        """
        return self.table.to_reader(max_chunksize=max_chunksize)

    def field(self, *args, **kwargs):
        """
        Select a schema field by its column name or numeric index.

        Args:
            i (`Union[int, str]`):
                The index or name of the field to retrieve.

        Returns:
            `pyarrow.Field`
        """
        return self.table.field(*args, **kwargs)

    def column(self, *args, **kwargs):
        """
        Select a column by its column name, or numeric index.

        Args:
            i (`Union[int, str]`):
                The index or name of the column to retrieve.

        Returns:
            `pyarrow.ChunkedArray`
        """
        return self.table.column(*args, **kwargs)

    def itercolumns(self, *args, **kwargs):
        """
        Iterator over all columns in their numerical order.

        Yields:
            `pyarrow.ChunkedArray`
        """
        return self.table.itercolumns(*args, **kwargs)

    @property
    def schema(self):
        """
        Schema of the table and its columns.

        Returns:
            `pyarrow.Schema`
        """
        return self.table.schema

    @property
    def columns(self):
        """
        List of all columns in numerical order.

        Returns:
            `List[pa.ChunkedArray]`
        """
        return self.table.columns

    @property
    def num_columns(self):
        """
        Number of columns in this table.

        Returns:
            int
        """
        return self.table.num_columns

    @property
    def num_rows(self):
        """
        Number of rows in this table.

        Due to the definition of a table, all columns have the same number of
        rows.

        Returns:
            int
        """
        return self.table.num_rows

    @property
    def shape(self):
        """
        Dimensions of the table: (#rows, #columns).

        Returns:
            `(int, int)`: Number of rows and number of columns.
        """
        return self.table.shape

    @property
    def nbytes(self):
        """
        Total number of bytes consumed by the elements of the table.
        """
        return self.table.nbytes

    @property
    def column_names(self):
        """
        Names of the table's columns.
        """
        return self.table.column_names

    def __eq__(self, other):
        return self.equals(other)

    def __getitem__(self, i):
        return self.table[i]

    def __len__(self):
        return len(self.table)

    def __repr__(self):
        return self.table.__repr__().replace("pyarrow.Table", self.__class__.__name__)

    def __str__(self):
        return self.table.__str__().replace("pyarrow.Table", self.__class__.__name__)

    def slice(self, *args, **kwargs):
        """
        Compute zero-copy slice of this Table.

        Args:
            offset (`int`, defaults to `0`):
                Offset from start of table to slice.
            length (`int`, defaults to `None`):
                Length of slice (default is until end of table starting from
                offset).

        Returns:
            `datasets.table.Table`
        """
        raise NotImplementedError()

    def filter(self, *args, **kwargs):
        """
        Select records from a Table. See `pyarrow.compute.filter` for full usage.
        """
        raise NotImplementedError()

    def flatten(self, *args, **kwargs):
        """
        Flatten this Table. Each column with a struct type is flattened
        into one column per struct field. Other columns are left unchanged.

        Args:
            memory_pool (`MemoryPool`, defaults to `None`):
                For memory allocations, if required, otherwise use default pool.

        Returns:
            `datasets.table.Table`
        """
        raise NotImplementedError()

    def combine_chunks(self, *args, **kwargs):
        """
        Make a new table by combining the chunks this table has.

        All the underlying chunks in the `ChunkedArray` of each column are
        concatenated into zero or one chunk.

        Args:
            memory_pool (`MemoryPool`, defaults to `None`):
                For memory allocations, if required, otherwise use default pool.

        Returns:
            `datasets.table.Table`
        """
        raise NotImplementedError()

    def cast(self, *args, **kwargs):
        """
        Cast table values to another schema.

        Args:
            target_schema (`Schema`):
                Schema to cast to, the names and order of fields must match.
            safe (`bool`, defaults to `True`):
                Check for overflows or other unsafe conversions.

        Returns:
            `datasets.table.Table`
        """
        raise NotImplementedError()

    def replace_schema_metadata(self, *args, **kwargs):
        """
        EXPERIMENTAL: Create shallow copy of table by replacing schema
        key-value metadata with the indicated new metadata (which may be `None`,
        which deletes any existing metadata).

        Args:
            metadata (`dict`, defaults to `None`):

        Returns:
            `datasets.table.Table`: shallow_copy
        """
        raise NotImplementedError()

    def add_column(self, *args, **kwargs):
        """
        Add column to Table at position.

        A new table is returned with the column added, the original table
        object is left unchanged.

        Args:
            i (`int`):
                Index to place the column at.
            field_ (`Union[str, pyarrow.Field]`):
                If a string is passed then the type is deduced from the column
                data.
            column (`Union[pyarrow.Array, List[pyarrow.Array]]`):
                Column data.

        Returns:
            `datasets.table.Table`: New table with the passed column added.
        """
        raise NotImplementedError()

    def append_column(self, *args, **kwargs):
        """
        Append column at end of columns.

        Args:
            field_ (`Union[str, pyarrow.Field]`):
                If a string is passed then the type is deduced from the column
                data.
            column (`Union[pyarrow.Array, List[pyarrow.Array]]`):
                Column data.

        Returns:
            `datasets.table.Table`: New table with the passed column added.
        """
        raise NotImplementedError()

    def remove_column(self, *args, **kwargs):
        """
        Create new Table with the indicated column removed.

        Args:
            i (`int`):
                Index of column to remove.

        Returns:
            `datasets.table.Table`: New table without the column.
        """
        raise NotImplementedError()

    def set_column(self, *args, **kwargs):
        """
        Replace column in Table at position.

        Args:
            i (`int`):
                Index to place the column at.
            field_ (`Union[str, pyarrow.Field]`):
                If a string is passed then the type is deduced from the column
                data.
            column (`Union[pyarrow.Array, List[pyarrow.Array]]`):
                Column data.

        Returns:
            `datasets.table.Table`: New table with the passed column set.
        """
        raise NotImplementedError()

    def rename_columns(self, *args, **kwargs):
        """
        Create new table with columns renamed to provided names.
        """
        raise NotImplementedError()

    def drop(self, *args, **kwargs):
        """
        Drop one or more columns and return a new table.

        Args:
            columns (`List[str]`):
                List of field names referencing existing columns.

        Raises:
            `KeyError`: if any of the passed column names do not exist.

        Returns:
            `datasets.table.Table`: New table without the columns.
        """
        raise NotImplementedError()

    def select(self, *args, **kwargs):
        """
        Select columns of the table.

        Returns a new table with the specified columns, and metadata preserved.

        Args:
            columns (:obj:`Union[List[str], List[int]]`):
                The column names or integer indices to select.

        Returns:
            `datasets.table.Table`: table with only a subset of the columns
        """
        raise NotImplementedError()


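# Illustrative sketch, not part of the original module: the composition
# pattern above in action. A bare `Table` delegates read-only accessors to the
# wrapped pyarrow table, while the transforms are left to the subclasses below.
if __name__ == "__main__":
    _wrapped = Table(pa.table({"a": [1, 2]}))
    print(_wrapped.num_rows, _wrapped.column_names)  # 2 ['a']
    try:
        _wrapped.slice(0, 1)  # transforms raise on the base class
    except NotImplementedError:
        print("slice() is only implemented on the subclasses")

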
class TableBlock(Table):
    """
    `TableBlock` is the allowed class inside a `ConcatenationTable`.
    Only `MemoryMappedTable` and `InMemoryTable` are `TableBlock`.
    This is because we don't want a `ConcatenationTable` made out of other `ConcatenationTable`s.
    """

    pass


class InMemoryTable(TableBlock):
    """
    The table is said to be in-memory when it is loaded into the user's RAM.

    Pickling it copies all the data in memory.
    Its implementation is simple and uses the underlying pyarrow Table methods directly.

    This is different from the `MemoryMapped` table, for which pickling doesn't copy all the
    data in memory. For a `MemoryMapped`, unpickling instead reloads the table from the disk.

    `InMemoryTable` must be used when data fit in memory, while `MemoryMapped` are reserved for
    data bigger than memory or when you want the memory footprint of your application to
    stay low.
    """

    @classmethod
    def from_file(cls, filename: str):
        table = _in_memory_arrow_table_from_file(filename)
        return cls(table)

    @classmethod
    def from_buffer(cls, buffer: pa.Buffer):
        table = _in_memory_arrow_table_from_buffer(buffer)
        return cls(table)

    @classmethod
    def from_pandas(cls, *args, **kwargs):
        """
        Convert pandas.DataFrame to an Arrow Table.

        The column types in the resulting Arrow Table are inferred from the
        dtypes of the pandas.Series in the DataFrame. In the case of non-object
        Series, the NumPy dtype is translated to its Arrow equivalent. In the
        case of `object`, we need to guess the datatype by looking at the
        Python objects in this Series.

        Be aware that Series of the `object` dtype don't carry enough
        information to always lead to a meaningful Arrow type. In the case that
        we cannot infer a type, e.g. because the DataFrame is of length 0 or
        the Series only contains `None/nan` objects, the type is set to
        null. This behavior can be avoided by constructing an explicit schema
        and passing it to this function.

        Args:
            df (`pandas.DataFrame`):
            schema (`pyarrow.Schema`, *optional*):
                The expected schema of the Arrow Table. This can be used to
                indicate the type of columns if we cannot infer it automatically.
                If passed, the output will have exactly this schema. Columns
                specified in the schema that are not found in the DataFrame columns
                or its index will raise an error. Additional columns or index
                levels in the DataFrame which are not specified in the schema will
                be ignored.
            preserve_index (`bool`, *optional*):
                Whether to store the index as an additional column in the resulting
                `Table`. The default of None will store the index as a column,
                except for RangeIndex which is stored as metadata only. Use
                `preserve_index=True` to force it to be stored as a column.
            nthreads (`int`, defaults to `None` (may use up to system CPU count threads)):
                If greater than 1, convert columns to Arrow in parallel using
                indicated number of threads.
            columns (`List[str]`, *optional*):
                List of columns to be converted. If `None`, use all columns.
            safe (`bool`, defaults to `True`):
                Check for overflows or other unsafe conversions.

        Returns:
            `datasets.table.Table`:

        Examples:
        ```python
        >>> import pandas as pd
        >>> import pyarrow as pa
        >>> df = pd.DataFrame({
        ...     'int': [1, 2],
        ...     'str': ['a', 'b']
        ... })
        >>> pa.Table.from_pandas(df)
        <pyarrow.lib.Table object at 0x7f05d1fb1b40>
        ```
        """
        return cls(pa.Table.from_pandas(*args, **kwargs))

    @classmethod
    def from_arrays(cls, *args, **kwargs):
        """
        Construct a Table from Arrow arrays.

        Args:
            arrays (`List[Union[pyarrow.Array, pyarrow.ChunkedArray]]`):
                Equal-length arrays that should form the table.
            names (`List[str]`, *optional*):
                Names for the table columns. If not passed, schema must be passed.
            schema (`Schema`, defaults to `None`):
                Schema for the created table. If not passed, names must be passed.
            metadata (`Union[dict, Mapping]`, defaults to `None`):
                Optional metadata for the schema (if inferred).

        Returns:
            `datasets.table.Table`
        """
        return cls(pa.Table.from_arrays(*args, **kwargs))

    @classmethod
    def from_pydict(cls, *args, **kwargs):
        """
        Construct a Table from Arrow arrays or columns.

        Args:
            mapping (`Union[dict, Mapping]`):
                A mapping of strings to Arrays or Python lists.
            schema (`Schema`, defaults to `None`):
                If not passed, will be inferred from the Mapping values.
            metadata (`Union[dict, Mapping]`, defaults to `None`):
                Optional metadata for the schema (if inferred).

        Returns:
            `datasets.table.Table`
        """
        return cls(pa.Table.from_pydict(*args, **kwargs))

    @classmethod
    def from_pylist(cls, mapping, *args, **kwargs):
        """
        Construct a Table from list of rows / dictionaries.

        Args:
            mapping (`List[dict]`):
                A mapping of strings to row values.
            schema (`Schema`, defaults to `None`):
                If not passed, will be inferred from the Mapping values.
            metadata (`Union[dict, Mapping]`, defaults to `None`):
                Optional metadata for the schema (if inferred).

        Returns:
            `datasets.table.Table`
        """
        return cls(pa.Table.from_pylist(mapping, *args, **kwargs))

    @classmethod
    def from_batches(cls, *args, **kwargs):
        """
        Construct a Table from a sequence or iterator of Arrow `RecordBatches`.

        Args:
            batches (`Union[Sequence[pyarrow.RecordBatch], Iterator[pyarrow.RecordBatch]]`):
                Sequence of `RecordBatch` to be converted, all schemas must be equal.
            schema (`Schema`, defaults to `None`):
                If not passed, will be inferred from the first `RecordBatch`.

        Returns:
            `datasets.table.Table`:
        """
        return cls(pa.Table.from_batches(*args, **kwargs))

    def slice(self, offset=0, length=None):
        """
        Compute zero-copy slice of this Table.

        Args:
            offset (`int`, defaults to `0`):
                Offset from start of table to slice.
            length (`int`, defaults to `None`):
                Length of slice (default is until end of table starting from
                offset).

        Returns:
            `datasets.table.Table`
        """
        # Use fast slicing here
        return InMemoryTable(self.fast_slice(offset=offset, length=length))

    def filter(self, *args, **kwargs):
        """
        Select records from a Table. See `pyarrow.compute.filter` for full usage.
        """
        return InMemoryTable(self.table.filter(*args, **kwargs))

    def flatten(self, *args, **kwargs):
        """
        Flatten this Table. Each column with a struct type is flattened
        into one column per struct field. Other columns are left unchanged.

        Args:
            memory_pool (`MemoryPool`, defaults to `None`):
                For memory allocations, if required, otherwise use default pool.

        Returns:
            `datasets.table.Table`
        """
        return InMemoryTable(table_flatten(self.table, *args, **kwargs))

    def combine_chunks(self, *args, **kwargs):
        """
        Make a new table by combining the chunks this table has.

        All the underlying chunks in the `ChunkedArray` of each column are
        concatenated into zero or one chunk.

        Args:
            memory_pool (`MemoryPool`, defaults to `None`):
                For memory allocations, if required, otherwise use default pool.

        Returns:
            `datasets.table.Table`
        """
        return InMemoryTable(self.table.combine_chunks(*args, **kwargs))

    def cast(self, *args, **kwargs):
        """
        Cast table values to another schema.

        Args:
            target_schema (`Schema`):
                Schema to cast to, the names and order of fields must match.
            safe (`bool`, defaults to `True`):
                Check for overflows or other unsafe conversions.

        Returns:
            `datasets.table.Table`
        """
        return InMemoryTable(table_cast(self.table, *args, **kwargs))

    def replace_schema_metadata(self, *args, **kwargs):
        """
        EXPERIMENTAL: Create shallow copy of table by replacing schema
        key-value metadata with the indicated new metadata (which may be `None`,
        which deletes any existing metadata).

        Args:
            metadata (`dict`, defaults to `None`):

        Returns:
            `datasets.table.Table`: shallow_copy
        """
        return InMemoryTable(self.table.replace_schema_metadata(*args, **kwargs))

    def add_column(self, *args, **kwargs):
        """
        Add column to Table at position.

        A new table is returned with the column added, the original table
        object is left unchanged.

        Args:
            i (`int`):
                Index to place the column at.
            field_ (`Union[str, pyarrow.Field]`):
                If a string is passed then the type is deduced from the column
                data.
            column (`Union[pyarrow.Array, List[pyarrow.Array]]`):
                Column data.

        Returns:
            `datasets.table.Table`: New table with the passed column added.
        """
        return InMemoryTable(self.table.add_column(*args, **kwargs))

    def append_column(self, *args, **kwargs):
        """
        Append column at end of columns.

        Args:
            field_ (`Union[str, pyarrow.Field]`):
                If a string is passed then the type is deduced from the column
                data.
            column (`Union[pyarrow.Array, List[pyarrow.Array]]`):
                Column data.

        Returns:
            `datasets.table.Table`:
                New table with the passed column added.
        """
        return InMemoryTable(self.table.append_column(*args, **kwargs))

    def remove_column(self, *args, **kwargs):
        """
        Create new Table with the indicated column removed.

        Args:
            i (`int`):
                Index of column to remove.

        Returns:
            `datasets.table.Table`:
                New table without the column.
        """
        return InMemoryTable(self.table.remove_column(*args, **kwargs))

    def set_column(self, *args, **kwargs):
        """
        Replace column in Table at position.

        Args:
            i (`int`):
                Index to place the column at.
            field_ (`Union[str, pyarrow.Field]`):
                If a string is passed then the type is deduced from the column
                data.
            column (`Union[pyarrow.Array, List[pyarrow.Array]]`):
                Column data.

        Returns:
            `datasets.table.Table`:
                New table with the passed column set.
        """
        return InMemoryTable(self.table.set_column(*args, **kwargs))

    def rename_columns(self, *args, **kwargs):
        """
        Create new table with columns renamed to provided names.
        """
        return InMemoryTable(self.table.rename_columns(*args, **kwargs))

    def drop(self, *args, **kwargs):
        """
        Drop one or more columns and return a new table.

        Args:
            columns (`List[str]`):
                List of field names referencing existing columns.

        Raises:
            `KeyError`: if any of the passed column names do not exist.

        Returns:
            `datasets.table.Table`:
                New table without the columns.
        """
        return InMemoryTable(self.table.drop(*args, **kwargs))

    def select(self, *args, **kwargs):
        """
        Select columns of the table.

        Returns a new table with the specified columns, and metadata preserved.

        Args:
            columns (:obj:`Union[List[str], List[int]]`):
                The column names or integer indices to select.

        Returns:
            :class:`datasets.table.Table`: New table with the specified columns, and metadata preserved.
        """
        return InMemoryTable(self.table.select(*args, **kwargs))


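# Illustrative sketch, not part of the original module: `InMemoryTable`
# round-trips through pickle by value, i.e. the arrow data itself travels
# inside the pickle payload (the assumption here is plain default pickling).
if __name__ == "__main__":
    import pickle

    _in_mem = InMemoryTable.from_pydict({"a": [1, 2, 3]})
    _restored = pickle.loads(pickle.dumps(_in_mem))
    assert _restored.to_pydict() == {"a": [1, 2, 3]}
    print(type(_restored).__name__)  # InMemoryTable

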
# The MemoryMappedTable needs replays to properly reload tables from the disk
Replay = Tuple[str, tuple, dict]


class MemoryMappedTable(TableBlock):
    """
    The table is said to be memory-mapped when it doesn't use the user's RAM but loads the data
    from the disk instead.

    Pickling it doesn't copy the data into memory.
    Instead, only the path to the memory mapped arrow file is pickled, as well as the list
    of transforms to "replay" when reloading the table from the disk.

    Its implementation requires storing a history of all the transforms that were applied
    to the underlying pyarrow Table, so that they can be "replayed" when reloading the Table
    from the disk.

    This is different from the `InMemoryTable` table, for which pickling does copy all the
    data in memory.

    `InMemoryTable` must be used when data fit in memory, while `MemoryMapped` are reserved for
    data bigger than memory or when you want the memory footprint of your application to
    stay low.
    """

    def __init__(self, table: pa.Table, path: str, replays: Optional[List[Replay]] = None):
        super().__init__(table)
        self.path = os.path.abspath(path)
        self.replays: List[Replay] = replays if replays is not None else []

    @classmethod
    def from_file(cls, filename: str, replays=None):
        table = _memory_mapped_arrow_table_from_file(filename)
        table = cls._apply_replays(table, replays)
        return cls(table, filename, replays)

    def __getstate__(self):
        return {"path": self.path, "replays": self.replays}

    def __setstate__(self, state):
        path = state["path"]
        replays = state["replays"]
        table = _memory_mapped_arrow_table_from_file(path)
        table = self._apply_replays(table, replays)
        MemoryMappedTable.__init__(self, table, path=path, replays=replays)

    @staticmethod
    def _apply_replays(table: pa.Table, replays: Optional[List[Replay]] = None) -> pa.Table:
        if replays is not None:
            for name, args, kwargs in replays:
                if name == "cast":
                    table = table_cast(table, *args, **kwargs)
                elif name == "flatten":
                    table = table_flatten(table, *args, **kwargs)
                else:
                    table = getattr(table, name)(*args, **kwargs)
        return table

    def _append_replay(self, replay: Replay) -> List[Replay]:
        replays = copy.deepcopy(self.replays)
        replays.append(replay)
        return replays

    def slice(self, offset=0, length=None):
        """
        Compute zero-copy slice of this Table.

        Args:
            offset (`int`, defaults to `0`):
                Offset from start of table to slice.
            length (`int`, defaults to `None`):
                Length of slice (default is until end of table starting from
                offset).

        Returns:
            `datasets.table.Table`
        """
        replay = ("slice", (offset, length), {})
        replays = self._append_replay(replay)
        # Use fast slicing here
        return MemoryMappedTable(self.fast_slice(offset=offset, length=length), self.path, replays)

    def filter(self, *args, **kwargs):
        """
        Select records from a Table. See `pyarrow.compute.filter` for full usage.
        """
        replay = ("filter", copy.deepcopy(args), copy.deepcopy(kwargs))
        replays = self._append_replay(replay)
        return MemoryMappedTable(self.table.filter(*args, **kwargs), self.path, replays)

    def flatten(self, *args, **kwargs):
        """
        Flatten this Table. Each column with a struct type is flattened
        into one column per struct field. Other columns are left unchanged.

        Args:
            memory_pool (`MemoryPool`, defaults to `None`):
                For memory allocations, if required, otherwise use default pool.

        Returns:
            `datasets.table.Table`
        """
        replay = ("flatten", copy.deepcopy(args), copy.deepcopy(kwargs))
        replays = self._append_replay(replay)
        return MemoryMappedTable(table_flatten(self.table, *args, **kwargs), self.path, replays)

    def combine_chunks(self, *args, **kwargs):
        """
        Make a new table by combining the chunks this table has.

        All the underlying chunks in the ChunkedArray of each column are
        concatenated into zero or one chunk.

        Args:
            memory_pool (`MemoryPool`, defaults to `None`):
                For memory allocations, if required, otherwise use default pool.

        Returns:
            `datasets.table.Table`
        """
        replay = ("combine_chunks", copy.deepcopy(args), copy.deepcopy(kwargs))
        replays = self._append_replay(replay)
        return MemoryMappedTable(self.table.combine_chunks(*args, **kwargs), self.path, replays)

    def cast(self, *args, **kwargs):
        """
        Cast table values to another schema.

        Args:
            target_schema (`Schema`):
                Schema to cast to, the names and order of fields must match.
            safe (`bool`, defaults to `True`):
                Check for overflows or other unsafe conversions.

        Returns:
            `datasets.table.Table`
        """
        replay = ("cast", copy.deepcopy(args), copy.deepcopy(kwargs))
        replays = self._append_replay(replay)
        return MemoryMappedTable(table_cast(self.table, *args, **kwargs), self.path, replays)

    def replace_schema_metadata(self, *args, **kwargs):
        """
        EXPERIMENTAL: Create shallow copy of table by replacing schema
        key-value metadata with the indicated new metadata (which may be `None`,
        which deletes any existing metadata).

        Args:
            metadata (`dict`, defaults to `None`):

        Returns:
            `datasets.table.Table`: shallow_copy
        """
        replay = ("replace_schema_metadata", copy.deepcopy(args), copy.deepcopy(kwargs))
        replays = self._append_replay(replay)
        return MemoryMappedTable(self.table.replace_schema_metadata(*args, **kwargs), self.path, replays)

    def add_column(self, *args, **kwargs):
        """
        Add column to Table at position.

        A new table is returned with the column added, the original table
        object is left unchanged.

        Args:
            i (`int`):
                Index to place the column at.
            field_ (`Union[str, pyarrow.Field]`):
                If a string is passed then the type is deduced from the column
                data.
            column (`Union[pyarrow.Array, List[pyarrow.Array]]`):
                Column data.

        Returns:
            `datasets.table.Table`: New table with the passed column added.
        """
        replay = ("add_column", copy.deepcopy(args), copy.deepcopy(kwargs))
        replays = self._append_replay(replay)
        return MemoryMappedTable(self.table.add_column(*args, **kwargs), self.path, replays)

    def append_column(self, *args, **kwargs):
        """
        Append column at end of columns.

        Args:
            field_ (`Union[str, pyarrow.Field]`):
                If a string is passed then the type is deduced from the column
                data.
            column (`Union[pyarrow.Array, List[pyarrow.Array]]`):
                Column data.

        Returns:
            `datasets.table.Table`:
                New table with the passed column added.
        """
        replay = ("append_column", copy.deepcopy(args), copy.deepcopy(kwargs))
        replays = self._append_replay(replay)
        return MemoryMappedTable(self.table.append_column(*args, **kwargs), self.path, replays)

    def remove_column(self, *args, **kwargs):
        """
        Create new Table with the indicated column removed.

        Args:
            i (`int`):
                Index of column to remove.

        Returns:
            `datasets.table.Table`:
                New table without the column.
        """
        replay = ("remove_column", copy.deepcopy(args), copy.deepcopy(kwargs))
        replays = self._append_replay(replay)
        return MemoryMappedTable(self.table.remove_column(*args, **kwargs), self.path, replays)

    def set_column(self, *args, **kwargs):
        """
        Replace column in Table at position.

        Args:
            i (`int`):
                Index to place the column at.
            field_ (`Union[str, pyarrow.Field]`):
                If a string is passed then the type is deduced from the column
                data.
            column (`Union[pyarrow.Array, List[pyarrow.Array]]`):
                Column data.

        Returns:
            `datasets.table.Table`:
                New table with the passed column set.
        """
        replay = ("set_column", copy.deepcopy(args), copy.deepcopy(kwargs))
        replays = self._append_replay(replay)
        return MemoryMappedTable(self.table.set_column(*args, **kwargs), self.path, replays)

    def rename_columns(self, *args, **kwargs):
        """
        Create new table with columns renamed to provided names.
        """
        replay = ("rename_columns", copy.deepcopy(args), copy.deepcopy(kwargs))
        replays = self._append_replay(replay)
        return MemoryMappedTable(self.table.rename_columns(*args, **kwargs), self.path, replays)

    def drop(self, *args, **kwargs):
        """
        Drop one or more columns and return a new table.

        Args:
            columns (`List[str]`):
                List of field names referencing existing columns.

        Raises:
            `KeyError`: if any of the passed column names do not exist.

        Returns:
            `datasets.table.Table`:
                New table without the columns.
        """
        replay = ("drop", copy.deepcopy(args), copy.deepcopy(kwargs))
        replays = self._append_replay(replay)
        return MemoryMappedTable(self.table.drop(*args, **kwargs), self.path, replays)

    def select(self, *args, **kwargs):
        """
        Select columns of the table.

        Returns a new table with the specified columns, and metadata preserved.

        Args:
            columns (:obj:`Union[List[str], List[int]]`):
                The column names or integer indices to select.

        Returns:
            :class:`datasets.table.Table`: New table with the specified columns, and metadata preserved.
        """
        replay = ("select", copy.deepcopy(args), copy.deepcopy(kwargs))
        replays = self._append_replay(replay)
        return MemoryMappedTable(self.table.select(*args, **kwargs), self.path, replays)


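# Illustrative sketch, not part of the original module: a `MemoryMappedTable`
# pickles as (path, replays) only; unpickling re-maps the file and re-applies
# the recorded transforms. `demo_path` comes from the sketch near the top of
# this module and is an assumption made for the demo.
if __name__ == "__main__":
    import pickle

    _mmap_table = MemoryMappedTable.from_file(demo_path).rename_columns(["A", "B"])
    print(_mmap_table.replays)  # [('rename_columns', (['A', 'B'],), {})]
    _reloaded = pickle.loads(pickle.dumps(_mmap_table))
    assert _reloaded.column_names == ["A", "B"]

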
# A ConcatenationTable is the concatenation of several tables.
# The ``blocks`` attribute stores a list of lists of blocks.
# The first axis concatenates the tables along the axis 0 (it appends rows),
# while the second axis concatenates tables along the axis 1 (it appends columns).
TableBlockContainer = TypeVar("TableBlockContainer", TableBlock, List[TableBlock], List[List[TableBlock]])


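# Illustrative sketch, not part of the original module: the null-filling
# ("promote") behavior that `ConcatenationTable` relies on when concatenating
# on axis 0. Columns missing from one table become null-filled columns.
if __name__ == "__main__":
    _t1 = pa.table({"a": [1, 2]})
    _t2 = pa.table({"a": [3], "b": ["z"]})
    if config.PYARROW_VERSION.major < 14:
        _combined = pa.concat_tables([_t1, _t2], promote=True)
    else:
        _combined = pa.concat_tables([_t1, _t2], promote_options="default")
    print(_combined.to_pydict())  # {'a': [1, 2, 3], 'b': [None, None, 'z']}

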
class ConcatenationTable(Table):
|
| 1275 |
+
"""
|
| 1276 |
+
The table comes from the concatenation of several tables called blocks.
|
| 1277 |
+
It enables concatenation on both axis 0 (append rows) and axis 1 (append columns).
|
| 1278 |
+
|
| 1279 |
+
The underlying tables are called "blocks" and can be either `InMemoryTable`
|
| 1280 |
+
or `MemoryMappedTable` objects.
|
| 1281 |
+
This allows to combine tables that come from memory or that are memory mapped.
|
| 1282 |
+
When a `ConcatenationTable` is pickled, then each block is pickled:
|
| 1283 |
+
- the `InMemoryTable` objects are pickled by copying all the data in memory.
|
| 1284 |
+
- the MemoryMappedTable objects are pickled without copying the data into memory.
|
| 1285 |
+
Instead, only the path to the memory mapped arrow file is pickled, as well as the list
|
| 1286 |
+
of transforms to "replays" when reloading the table from the disk.
|
| 1287 |
+
|
| 1288 |
+
Its implementation requires to store each block separately.
|
| 1289 |
+
The `blocks` attributes stores a list of list of blocks.
|
| 1290 |
+
The first axis concatenates the tables along the axis 0 (it appends rows),
|
| 1291 |
+
while the second axis concatenates tables along the axis 1 (it appends columns).
|
| 1292 |
+
|
| 1293 |
+
If some columns are missing when concatenating on axis 0, they are filled with null values.
|
| 1294 |
+
This is done using `pyarrow.concat_tables(tables, promote=True)`.
|
| 1295 |
+
|
| 1296 |
+
You can access the fully combined table by accessing the `ConcatenationTable.table` attribute,
|
| 1297 |
+
and the blocks by accessing the `ConcatenationTable.blocks` attribute.
|
| 1298 |
+
"""
|
| 1299 |
+
|
| 1300 |
+
def __init__(self, table: pa.Table, blocks: List[List[TableBlock]]):
|
| 1301 |
+
super().__init__(table)
|
| 1302 |
+
self.blocks = blocks
|
| 1303 |
+
# Check that all the blocks have the right type.
|
| 1304 |
+
# Only InMemoryTable and MemoryMappedTable are allowed.
|
| 1305 |
+
for subtables in blocks:
|
| 1306 |
+
for subtable in subtables:
|
| 1307 |
+
if not isinstance(subtable, TableBlock):
|
| 1308 |
+
raise TypeError(
|
| 1309 |
+
"The blocks of a ConcatenationTable must be InMemoryTable or MemoryMappedTable objects"
|
| 1310 |
+
f", but got {_short_str(subtable)}."
|
| 1311 |
+
)
|
| 1312 |
+
|
| 1313 |
+
def __getstate__(self):
|
| 1314 |
+
return {"blocks": self.blocks, "schema": self.table.schema}
|
| 1315 |
+
|
| 1316 |
+
def __setstate__(self, state):
|
| 1317 |
+
blocks = state["blocks"]
|
| 1318 |
+
schema = state["schema"]
|
| 1319 |
+
table = self._concat_blocks_horizontally_and_vertically(blocks)
|
| 1320 |
+
if schema is not None and table.schema != schema:
|
| 1321 |
+
# We fix the columns by concatenating with an empty table with the right columns
|
| 1322 |
+
empty_table = pa.Table.from_batches([], schema=schema)
|
| 1323 |
+
# we set promote=True to fill missing columns with null values
|
| 1324 |
+
if config.PYARROW_VERSION.major < 14:
|
| 1325 |
+
table = pa.concat_tables([table, empty_table], promote=True)
|
| 1326 |
+
else:
|
| 1327 |
+
table = pa.concat_tables([table, empty_table], promote_options="default")
|
| 1328 |
+
ConcatenationTable.__init__(self, table, blocks=blocks)
|
| 1329 |
+
|
| 1330 |
+
@staticmethod
|
| 1331 |
+
def _concat_blocks(blocks: List[Union[TableBlock, pa.Table]], axis: int = 0) -> pa.Table:
|
| 1332 |
+
pa_tables = [table.table if hasattr(table, "table") else table for table in blocks]
|
| 1333 |
+
if axis == 0:
|
| 1334 |
+
# we set promote=True to fill missing columns with null values
|
| 1335 |
+
if config.PYARROW_VERSION.major < 14:
|
| 1336 |
+
return pa.concat_tables(pa_tables, promote=True)
|
| 1337 |
+
else:
|
| 1338 |
+
return pa.concat_tables(pa_tables, promote_options="default")
|
| 1339 |
+
elif axis == 1:
|
| 1340 |
+
for i, table in enumerate(pa_tables):
|
| 1341 |
+
if i == 0:
|
| 1342 |
+
pa_table = table
|
| 1343 |
+
else:
|
| 1344 |
+
for name, col in zip(table.column_names, table.columns):
|
| 1345 |
+
pa_table = pa_table.append_column(name, col)
|
| 1346 |
+
return pa_table
|
| 1347 |
+
else:
|
| 1348 |
+
raise ValueError("'axis' must be either 0 or 1")
|
| 1349 |
+
|
| 1350 |
+
@classmethod
|
| 1351 |
+
def _concat_blocks_horizontally_and_vertically(cls, blocks: List[List[TableBlock]]) -> pa.Table:
|
| 1352 |
+
pa_tables_to_concat_vertically = []
|
| 1353 |
+
for i, tables in enumerate(blocks):
|
| 1354 |
+
if not tables:
|
| 1355 |
+
continue
|
| 1356 |
+
pa_table_horizontally_concatenated = cls._concat_blocks(tables, axis=1)
|
| 1357 |
+
pa_tables_to_concat_vertically.append(pa_table_horizontally_concatenated)
|
| 1358 |
+
return cls._concat_blocks(pa_tables_to_concat_vertically, axis=0)
|
| 1359 |
+
|
| 1360 |
+
@classmethod
|
| 1361 |
+
def _merge_blocks(cls, blocks: TableBlockContainer, axis: Optional[int] = None) -> TableBlockContainer:
|
| 1362 |
+
if axis is not None:
|
| 1363 |
+
merged_blocks = []
|
| 1364 |
+
for is_in_memory, block_group in groupby(blocks, key=lambda x: isinstance(x, InMemoryTable)):
|
| 1365 |
+
if is_in_memory:
|
| 1366 |
+
block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
|
| 1367 |
+
merged_blocks += list(block_group)
|
| 1368 |
+
else: # both
|
| 1369 |
+
merged_blocks = [cls._merge_blocks(row_block, axis=1) for row_block in blocks]
|
| 1370 |
+
if all(len(row_block) == 1 for row_block in merged_blocks):
|
| 1371 |
+
merged_blocks = cls._merge_blocks(
|
| 1372 |
+
[block for row_block in merged_blocks for block in row_block], axis=0
|
| 1373 |
+
)
|
| 1374 |
+
return merged_blocks
|
| 1375 |
+
|
| 1376 |
+
@classmethod
|
| 1377 |
+
def _consolidate_blocks(cls, blocks: TableBlockContainer) -> TableBlockContainer:
|
| 1378 |
+
if isinstance(blocks, TableBlock):
|
| 1379 |
+
return blocks
|
| 1380 |
+
elif isinstance(blocks[0], TableBlock):
|
| 1381 |
+
return cls._merge_blocks(blocks, axis=0)
|
| 1382 |
+
else:
|
| 1383 |
+
return cls._merge_blocks(blocks)
|
| 1384 |
+
|
| 1385 |
+
@classmethod
|
| 1386 |
+
def from_blocks(cls, blocks: TableBlockContainer) -> "ConcatenationTable":
|
| 1387 |
+
blocks = cls._consolidate_blocks(blocks)
|
| 1388 |
+
if isinstance(blocks, TableBlock):
|
| 1389 |
+
table = blocks
|
| 1390 |
+
return cls(table.table, [[table]])
|
| 1391 |
+
elif isinstance(blocks[0], TableBlock):
|
| 1392 |
+
            table = cls._concat_blocks(blocks, axis=0)
            blocks = [[t] for t in blocks]
            return cls(table, blocks)
        else:
            table = cls._concat_blocks_horizontally_and_vertically(blocks)
            return cls(table, blocks)

    @classmethod
    def from_tables(cls, tables: List[Union[pa.Table, Table]], axis: int = 0) -> "ConcatenationTable":
        """Create `ConcatenationTable` from list of tables.

        Args:
            tables (list of `Table` or list of `pyarrow.Table`):
                List of tables.
            axis (`{0, 1}`, defaults to `0`, meaning over rows):
                Axis to concatenate over, where `0` means over rows (vertically) and `1` means over columns
                (horizontally).

                <Added version="1.6.0"/>
        """

        def to_blocks(table: Union[pa.Table, Table]) -> List[List[TableBlock]]:
            if isinstance(table, pa.Table):
                return [[InMemoryTable(table)]]
            elif isinstance(table, ConcatenationTable):
                return copy.deepcopy(table.blocks)
            else:
                return [[table]]

        def _slice_row_block(row_block: List[TableBlock], length: int) -> Tuple[List[TableBlock], List[TableBlock]]:
            sliced = [table.slice(0, length) for table in row_block]
            remainder = [table.slice(length, len(row_block[0]) - length) for table in row_block]
            return sliced, remainder

        def _split_both_like(
            result: List[List[TableBlock]], blocks: List[List[TableBlock]]
        ) -> Tuple[List[List[TableBlock]], List[List[TableBlock]]]:
            """
            Make sure each row_block contains the same num_rows to be able to concatenate them on axis=1.

            To do so, we modify both blocks sets to have the same row_blocks boundaries.
            For example, if `result` has 2 row_blocks of 3 rows and `blocks` has 3 row_blocks of 2 rows,
            we modify both to have 4 row_blocks of size 2, 1, 1 and 2:

                    [ x x x | x x x ]
                +   [ y y | y y | y y ]
                -----------------------------
                =   [ x x | x | x | x x ]
                    [ y y | y | y | y y ]

            """
            result, blocks = list(result), list(blocks)
            new_result, new_blocks = [], []
            while result and blocks:
                # we slice the longest row block to save two row blocks of same length
                # and we replace the long row block by its remainder if necessary
                if len(result[0][0]) > len(blocks[0][0]):
                    new_blocks.append(blocks[0])
                    sliced, result[0] = _slice_row_block(result[0], len(blocks.pop(0)[0]))
                    new_result.append(sliced)
                elif len(result[0][0]) < len(blocks[0][0]):
                    new_result.append(result[0])
                    sliced, blocks[0] = _slice_row_block(blocks[0], len(result.pop(0)[0]))
                    new_blocks.append(sliced)
                else:
                    new_result.append(result.pop(0))
                    new_blocks.append(blocks.pop(0))
            if result or blocks:
                raise ValueError("Failed to concatenate on axis=1 because tables don't have the same number of rows")
            return new_result, new_blocks

        def _extend_blocks(
            result: List[List[TableBlock]], blocks: List[List[TableBlock]], axis: int = 0
        ) -> List[List[TableBlock]]:
            if axis == 0:
                result.extend(blocks)
            elif axis == 1:
                # We make sure each row_block has the same num_rows
                result, blocks = _split_both_like(result, blocks)
                for i, row_block in enumerate(blocks):
                    result[i].extend(row_block)
            return result

        blocks = to_blocks(tables[0])
        for table in tables[1:]:
            table_blocks = to_blocks(table)
            blocks = _extend_blocks(blocks, table_blocks, axis=axis)
        return cls.from_blocks(blocks)
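
# --- usage sketch (not part of the vendored file) ---
# A minimal sketch, assuming the `datasets` package is installed, of how
# `ConcatenationTable.from_tables` handles both axes: axis=0 stacks row
# blocks vertically, axis=1 joins columns and requires matching row counts.
import pyarrow as pa
from datasets.table import ConcatenationTable

t1 = pa.table({"a": [1, 2, 3]})
t2 = pa.table({"a": [4, 5, 6]})

# axis=0: the two tables become two row blocks stacked vertically.
vertical = ConcatenationTable.from_tables([t1, t2], axis=0)
print(vertical.num_rows)  # 6

# axis=1: column-wise concatenation; both inputs must have the same number of rows.
t3 = pa.table({"b": ["x", "y", "z"]})
horizontal = ConcatenationTable.from_tables([t1, t3], axis=1)
print(horizontal.column_names)  # ['a', 'b']
# --- end of sketch ---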

    @property
    def _slices(self):
        offset = 0
        for tables in self.blocks:
            length = len(tables[0])
            yield (offset, length)
            offset += length

    def slice(self, offset=0, length=None):
        """
        Compute zero-copy slice of this Table.

        Args:
            offset (`int`, defaults to `0`):
                Offset from start of table to slice.
            length (`int`, defaults to `None`):
                Length of slice (default is until end of table starting from
                offset).

        Returns:
            `datasets.table.Table`
        """
        table = self.table.slice(offset, length=length)
        length = length if length is not None else self.num_rows - offset
        blocks = []
        for tables in self.blocks:
            n_rows = len(tables[0])
            if length == 0:
                break
            elif n_rows <= offset:
                offset = offset - n_rows
            elif n_rows <= offset + length:
                blocks.append([t.slice(offset) for t in tables])
                length, offset = length + offset - n_rows, 0
            else:
                blocks.append([t.slice(offset, length) for t in tables])
                length, offset = 0, 0
        return ConcatenationTable(table, blocks)
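
# --- usage sketch (not part of the vendored file) ---
# A short sketch, assuming `datasets` is installed, of zero-copy slicing
# across block boundaries: the slice below spans the end of the first block
# and the start of the second, and the result is still a ConcatenationTable.
import pyarrow as pa
from datasets.table import ConcatenationTable

t = ConcatenationTable.from_tables([pa.table({"a": [1, 2, 3]}), pa.table({"a": [4, 5, 6]})])
s = t.slice(2, 2)  # rows 2 and 3 overall: one row from each block
print(s.to_pydict())  # {'a': [3, 4]}
# --- end of sketch ---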

    def filter(self, mask, *args, **kwargs):
        """
        Select records from a Table. See `pyarrow.compute.filter` for full usage.
        """
        table = self.table.filter(mask, *args, **kwargs)
        blocks = []
        for (offset, length), tables in zip(self._slices, self.blocks):
            submask = mask.slice(offset, length)
            blocks.append([t.filter(submask, *args, **kwargs) for t in tables])
        return ConcatenationTable(table, blocks)

    def flatten(self, *args, **kwargs):
        """
        Flatten this Table. Each column with a struct type is flattened
        into one column per struct field. Other columns are left unchanged.

        Args:
            memory_pool (`MemoryPool`, defaults to `None`):
                For memory allocations, if required, otherwise use default pool.

        Returns:
            `datasets.table.Table`
        """
        table = table_flatten(self.table, *args, **kwargs)
        blocks = []
        for tables in self.blocks:
            blocks.append([t.flatten(*args, **kwargs) for t in tables])
        return ConcatenationTable(table, blocks)

    def combine_chunks(self, *args, **kwargs):
        """
        Make a new table by combining the chunks this table has.

        All the underlying chunks in the `ChunkedArray` of each column are
        concatenated into zero or one chunk.

        Args:
            memory_pool (`MemoryPool`, defaults to `None`):
                For memory allocations, if required, otherwise use default pool.

        Returns:
            `datasets.table.Table`
        """
        table = self.table.combine_chunks(*args, **kwargs)
        blocks = []
        for tables in self.blocks:
            blocks.append([t.combine_chunks(*args, **kwargs) for t in tables])
        return ConcatenationTable(table, blocks)

    def cast(self, target_schema, *args, **kwargs):
        """
        Cast table values to another schema.

        Args:
            target_schema (`Schema`):
                Schema to cast to, the names and order of fields must match.
            safe (`bool`, defaults to `True`):
                Check for overflows or other unsafe conversions.

        Returns:
            `datasets.table.Table`
        """
        from .features import Features

        table = table_cast(self.table, target_schema, *args, **kwargs)
        target_features = Features.from_arrow_schema(target_schema)
        blocks = []
        for subtables in self.blocks:
            new_tables = []
            fields = list(target_schema)
            for subtable in subtables:
                subfields = []
                for name in subtable.column_names:
                    subfields.append(fields.pop(next(i for i, field in enumerate(fields) if field.name == name)))
                subfeatures = Features({subfield.name: target_features[subfield.name] for subfield in subfields})
                subschema = subfeatures.arrow_schema
                new_tables.append(subtable.cast(subschema, *args, **kwargs))
            blocks.append(new_tables)
        return ConcatenationTable(table, blocks)

    def replace_schema_metadata(self, *args, **kwargs):
        """
        EXPERIMENTAL: Create shallow copy of table by replacing schema
        key-value metadata with the indicated new metadata (which may be `None`,
        which deletes any existing metadata).

        Args:
            metadata (`dict`, defaults to `None`):

        Returns:
            `datasets.table.Table`: shallow_copy
        """
        table = self.table.replace_schema_metadata(*args, **kwargs)
        blocks = []
        for tables in self.blocks:
            blocks.append([t.replace_schema_metadata(*args, **kwargs) for t in tables])
        return ConcatenationTable(table, self.blocks)

    def add_column(self, *args, **kwargs):
        """
        Add column to Table at position.

        A new table is returned with the column added, the original table
        object is left unchanged.

        Args:
            i (`int`):
                Index to place the column at.
            field_ (`Union[str, pyarrow.Field]`):
                If a string is passed then the type is deduced from the column
                data.
            column (`Union[pyarrow.Array, List[pyarrow.Array]]`):
                Column data.

        Returns:
            `datasets.table.Table`: New table with the passed column added.
        """
        raise NotImplementedError()

    def append_column(self, *args, **kwargs):
        """
        Append column at end of columns.

        Args:
            field_ (`Union[str, pyarrow.Field]`):
                If a string is passed then the type is deduced from the column
                data.
            column (`Union[pyarrow.Array, List[pyarrow.Array]]`):
                Column data.

        Returns:
            `datasets.table.Table`:
                New table with the passed column added.
        """
        raise NotImplementedError()

    def remove_column(self, i, *args, **kwargs):
        """
        Create new Table with the indicated column removed.

        Args:
            i (`int`):
                Index of column to remove.

        Returns:
            `datasets.table.Table`:
                New table without the column.
        """
        table = self.table.remove_column(i, *args, **kwargs)
        name = self.table.column_names[i]
        blocks = []
        for tables in self.blocks:
            blocks.append(
                [
                    t.remove_column(t.column_names.index(name), *args, **kwargs) if name in t.column_names else t
                    for t in tables
                ]
            )
        return ConcatenationTable(table, blocks)

    def set_column(self, *args, **kwargs):
        """
        Replace column in Table at position.

        Args:
            i (`int`):
                Index to place the column at.
            field_ (`Union[str, pyarrow.Field]`):
                If a string is passed then the type is deduced from the column
                data.
            column (`Union[pyarrow.Array, List[pyarrow.Array]]`):
                Column data.

        Returns:
            `datasets.table.Table`:
                New table with the passed column set.
        """
        raise NotImplementedError()

    def rename_columns(self, names, *args, **kwargs):
        """
        Create new table with columns renamed to provided names.
        """
        table = self.table.rename_columns(names, *args, **kwargs)
        names = dict(zip(self.table.column_names, names))
        blocks = []
        for tables in self.blocks:
            blocks.append(
                [t.rename_columns([names[name] for name in t.column_names], *args, **kwargs) for t in tables]
            )
        return ConcatenationTable(table, blocks)

    def drop(self, columns, *args, **kwargs):
        """
        Drop one or more columns and return a new table.

        Args:
            columns (`List[str]`):
                List of field names referencing existing columns.

        Raises:
            `KeyError`: if any of the passed column names do not exist.

        Returns:
            `datasets.table.Table`:
                New table without the columns.
        """
        table = self.table.drop(columns, *args, **kwargs)
        blocks = []
        for tables in self.blocks:
            blocks.append([t.drop([c for c in columns if c in t.column_names], *args, **kwargs) for t in tables])
        return ConcatenationTable(table, blocks)

    def select(self, columns, *args, **kwargs):
        """
        Select columns of the table.

        Returns a new table with the specified columns, and metadata preserved.

        Args:
            columns (:obj:`Union[List[str], List[int]]`):
                The column names or integer indices to select.

        Returns:
            :class:`datasets.table.Table`: New table with the specified columns, and metadata preserved.
        """
        table = self.table.select(columns, *args, **kwargs)
        blocks = []
        for tables in self.blocks:
            blocks.append([t.select([c for c in columns if c in t.column_names], *args, **kwargs) for t in tables])
        return ConcatenationTable(table, blocks)


def concat_tables(tables: List[Table], axis: int = 0) -> Table:
    """
    Concatenate tables.

    Args:
        tables (list of `Table`):
            List of tables to be concatenated.
        axis (`{0, 1}`, defaults to `0`, meaning over rows):
            Axis to concatenate over, where `0` means over rows (vertically) and `1` means over columns
            (horizontally).

            <Added version="1.6.0"/>
    Returns:
        `datasets.table.Table`:
            If the number of input tables is > 1, then the returned table is a `datasets.table.ConcatenationTable`.
            Otherwise if there's only one table, it is returned as is.
    """
    tables = list(tables)
    if len(tables) == 1:
        return tables[0]
    return ConcatenationTable.from_tables(tables, axis=axis)
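
# --- usage sketch (not part of the vendored file) ---
# Hedged usage sketch for `concat_tables`, assuming `datasets` is installed:
# a single input is returned as-is; several inputs produce a
# ConcatenationTable, which keeps the original blocks alive so that
# memory-mapped sub-tables stay memory-mapped.
import pyarrow as pa
from datasets.table import InMemoryTable, concat_tables

t1 = InMemoryTable(pa.table({"a": [1, 2]}))
t2 = InMemoryTable(pa.table({"a": [3, 4]}))

single = concat_tables([t1])          # returned unchanged
combined = concat_tables([t1, t2])    # datasets.table.ConcatenationTable
print(type(single).__name__, type(combined).__name__, combined.num_rows)
# --- end of sketch ---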


def list_table_cache_files(table: Table) -> List[str]:
    """
    Get the cache files that are loaded by the table.
    Cache files are used when parts of the table come from the disk via memory mapping.

    Returns:
        `List[str]`:
            A list of paths to the cache files loaded by the table.
    """
    if isinstance(table, ConcatenationTable):
        cache_files = []
        for subtables in table.blocks:
            for subtable in subtables:
                cache_files += list_table_cache_files(subtable)
        return cache_files
    elif isinstance(table, MemoryMappedTable):
        return [table.path]
    else:
        return []


def _wrap_for_chunked_arrays(func):
    """Apply the function on each chunk of a `pyarrow.ChunkedArray`, or on the array directly"""

    def wrapper(array, *args, **kwargs):
        if isinstance(array, pa.ChunkedArray):
            return pa.chunked_array([func(chunk, *args, **kwargs) for chunk in array.chunks])
        else:
            return func(array, *args, **kwargs)

    return wrapper
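
# --- usage sketch (not part of the vendored file) ---
# A self-contained sketch of what the decorator buys you: the wrapped
# function is written for a plain `pa.Array`, and the wrapper transparently
# maps it over every chunk of a `pa.ChunkedArray`. `double` is a hypothetical
# example function, not part of the library; the decorator body mirrors the
# one defined above.
import pyarrow as pa
import pyarrow.compute as pc

def _wrap_for_chunked_arrays(func):
    def wrapper(array, *args, **kwargs):
        if isinstance(array, pa.ChunkedArray):
            return pa.chunked_array([func(chunk, *args, **kwargs) for chunk in array.chunks])
        else:
            return func(array, *args, **kwargs)
    return wrapper

@_wrap_for_chunked_arrays
def double(array: pa.Array) -> pa.Array:
    return pc.multiply(array, 2)

chunked = pa.chunked_array([[1, 2], [3]])
print(double(chunked))          # ChunkedArray with chunks [2, 4] and [6]
print(double(pa.array([5])))    # plain Array [10]
# --- end of sketch ---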


def _are_list_values_of_length(array: pa.ListArray, length: int) -> bool:
    """Check if all the sub-lists of a `pa.ListArray` have the specified length."""
    return pc.all(pc.equal(array.value_lengths(), length)).as_py() or array.null_count == len(array)


def _combine_list_array_offsets_with_mask(array: pa.ListArray) -> pa.Array:
    """Add the null bitmap to the offsets of a `pa.ListArray`."""
    offsets = array.offsets
    if array.null_count > 0:
        offsets = pa.concat_arrays(
            [
                pc.replace_with_mask(offsets[:-1], array.is_null(), pa.nulls(len(array), pa.int32())),
                offsets[-1:],
            ]
        )
    return offsets


def _storage_type(type: pa.DataType) -> pa.DataType:
    """Convert a (possibly nested) `pa.ExtensionType` to its storage type."""
    if isinstance(type, pa.ExtensionType):
        return _storage_type(type.storage_type)
    elif isinstance(type, pa.StructType):
        return pa.struct([pa.field(field.name, _storage_type(field.type)) for field in type])
    elif isinstance(type, pa.ListType):
        return pa.list_(_storage_type(type.value_type))
    elif isinstance(type, pa.FixedSizeListType):
        return pa.list_(_storage_type(type.value_type), type.list_size)
    return type


def _short_str(value: Any) -> str:
    out = str(value)
    if len(out) > 3000:
        out = out[:1500] + "\n...\n" + out[-1500:]
    return out


@_wrap_for_chunked_arrays
def array_cast(
    array: pa.Array, pa_type: pa.DataType, allow_primitive_to_str: bool = True, allow_decimal_to_str: bool = True
) -> Union[pa.Array, pa.FixedSizeListArray, pa.ListArray, pa.StructArray, pa.ExtensionArray]:
    """Improved version of `pa.Array.cast`

    It supports casting `pa.StructArray` objects to re-order the fields.
    It also lets you control certain aspects of the casting, e.g. whether
    to disable casting primitives (`booleans`, `floats` or `ints`) or
    disable casting decimals to strings.

    Args:
        array (`pa.Array`):
            PyArrow array to cast
        pa_type (`pa.DataType`):
            Target PyArrow type
        allow_primitive_to_str (`bool`, defaults to `True`):
            Whether to allow casting primitives to strings.
            Defaults to `True`.
        allow_decimal_to_str (`bool`, defaults to `True`):
            Whether to allow casting decimals to strings.
            Defaults to `True`.

    Raises:
        `pa.ArrowInvalidError`: if the arrow data casting fails
        `TypeError`: if the target type is not supported, e.g.

            - if a field is missing
            - if casting from primitives to strings and `allow_primitive_to_str` is `False`
            - if casting from decimals to strings and `allow_decimal_to_str` is `False`

    Returns:
        `List[pyarrow.Array]`: the casted array
    """
    _c = partial(array_cast, allow_primitive_to_str=allow_primitive_to_str, allow_decimal_to_str=allow_decimal_to_str)
    if isinstance(array, pa.ExtensionArray):
        array = array.storage
    if isinstance(pa_type, pa.ExtensionType):
        return pa_type.wrap_array(_c(array, pa_type.storage_type))
    elif array.type == pa_type:
        return array
    elif pa.types.is_struct(array.type):
        if pa.types.is_struct(pa_type) and ({field.name for field in pa_type} == {field.name for field in array.type}):
            if array.type.num_fields == 0:
                return array
            arrays = [_c(array.field(field.name), field.type) for field in pa_type]
            return pa.StructArray.from_arrays(arrays, fields=list(pa_type), mask=array.is_null())
    elif pa.types.is_list(array.type):
        if pa.types.is_fixed_size_list(pa_type):
            if _are_list_values_of_length(array, pa_type.list_size):
                if array.null_count > 0:
                    # Ensure each null value in the array translates to [null] * pa_type.list_size in the array's values array
                    array_type = array.type
                    storage_type = _storage_type(array_type)
                    if array_type != storage_type:
                        # Temporarily convert to the storage type to support extension types in the slice operation
                        array = _c(array, storage_type)
                        array = pc.list_slice(array, 0, pa_type.list_size, return_fixed_size_list=True)
                        array = _c(array, array_type)
                    else:
                        array = pc.list_slice(array, 0, pa_type.list_size, return_fixed_size_list=True)
                    array_values = array.values
                    if config.PYARROW_VERSION.major < 15:
                        return pa.Array.from_buffers(
                            pa_type,
                            len(array),
                            [array.is_valid().buffers()[1]],
                            children=[_c(array_values, pa_type.value_type)],
                        )
                    else:
                        return pa.FixedSizeListArray.from_arrays(
                            _c(array_values, pa_type.value_type), pa_type.list_size, mask=array.is_null()
                        )
                else:
                    array_values = array.values[
                        array.offset * pa_type.length : (array.offset + len(array)) * pa_type.length
                    ]
                    return pa.FixedSizeListArray.from_arrays(_c(array_values, pa_type.value_type), pa_type.list_size)
        elif pa.types.is_list(pa_type):
            # Merge offsets with the null bitmap to avoid the "Null bitmap with offsets slice not supported" ArrowNotImplementedError
            array_offsets = _combine_list_array_offsets_with_mask(array)
            return pa.ListArray.from_arrays(array_offsets, _c(array.values, pa_type.value_type))
    elif pa.types.is_fixed_size_list(array.type):
        if pa.types.is_fixed_size_list(pa_type):
            if pa_type.list_size == array.type.list_size:
                array_values = array.values[
                    array.offset * array.type.list_size : (array.offset + len(array)) * array.type.list_size
                ]
                if config.PYARROW_VERSION.major < 15:
                    return pa.Array.from_buffers(
                        pa_type,
                        len(array),
                        [array.is_valid().buffers()[1]],
                        children=[_c(array_values, pa_type.value_type)],
                    )
                else:
                    return pa.FixedSizeListArray.from_arrays(
                        _c(array_values, pa_type.value_type), pa_type.list_size, mask=array.is_null()
                    )
        elif pa.types.is_list(pa_type):
            array_offsets = (np.arange(len(array) + 1) + array.offset) * array.type.list_size
            return pa.ListArray.from_arrays(array_offsets, _c(array.values, pa_type.value_type), mask=array.is_null())
    else:
        if pa.types.is_string(pa_type):
            if not allow_primitive_to_str and pa.types.is_primitive(array.type):
                raise TypeError(
                    f"Couldn't cast array of type {_short_str(array.type)} to {_short_str(pa_type)} "
                    f"since allow_primitive_to_str is set to {allow_primitive_to_str} "
                )
            if not allow_decimal_to_str and pa.types.is_decimal(array.type):
                raise TypeError(
                    f"Couldn't cast array of type {_short_str(array.type)} to {_short_str(pa_type)} "
                    f"and allow_decimal_to_str is set to {allow_decimal_to_str}"
                )
        if pa.types.is_null(pa_type) and not pa.types.is_null(array.type):
            raise TypeError(f"Couldn't cast array of type {_short_str(array.type)} to {_short_str(pa_type)}")
        return array.cast(pa_type)
    raise TypeError(f"Couldn't cast array of type {_short_str(array.type)} to {_short_str(pa_type)}")
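
# --- usage sketch (not part of the vendored file) ---
# A hedged sketch, assuming `datasets` is installed, of the main extra
# behaviour `array_cast` adds over `pa.Array.cast`: struct fields are matched
# by name, so the target type may list them in a different order.
import pyarrow as pa
from datasets.table import array_cast

structs = pa.array([{"x": 1, "y": "a"}, {"x": 2, "y": "b"}])
print(structs.type)  # struct<x: int64, y: string>

reordered = array_cast(structs, pa.struct([("y", pa.string()), ("x", pa.int64())]))
print(reordered.type)  # struct<y: string, x: int64>

# Primitive-to-string casting can be vetoed explicitly:
try:
    array_cast(pa.array([1, 2]), pa.string(), allow_primitive_to_str=False)
except TypeError as e:
    print(e)
# --- end of sketch ---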


@_wrap_for_chunked_arrays
def cast_array_to_feature(
    array: pa.Array, feature: "FeatureType", allow_primitive_to_str: bool = True, allow_decimal_to_str: bool = True
) -> pa.Array:
    """Cast an array to the arrow type that corresponds to the requested feature type.
    For custom features like [`Audio`] or [`Image`], it takes into account the "cast_storage" methods
    they defined to enable casting from other arrow types.

    Args:
        array (`pa.Array`):
            The PyArrow array to cast.
        feature (`datasets.features.FeatureType`):
            The target feature type.
        allow_primitive_to_str (`bool`, defaults to `True`):
            Whether to allow casting primitives to strings.
            Defaults to `True`.
        allow_decimal_to_str (`bool`, defaults to `True`):
            Whether to allow casting decimals to strings.
            Defaults to `True`.

    Raises:
        `pa.ArrowInvalidError`: if the arrow data casting fails
        `TypeError`: if the target type is not supported, e.g.

            - if a field is missing
            - if casting from primitives and `allow_primitive_to_str` is `False`
            - if casting from decimals and `allow_decimal_to_str` is `False`

    Returns:
        array (`pyarrow.Array`): the casted array
    """
    from .features.features import Sequence, get_nested_type

    _c = partial(
        cast_array_to_feature,
        allow_primitive_to_str=allow_primitive_to_str,
        allow_decimal_to_str=allow_decimal_to_str,
    )

    if isinstance(array, pa.ExtensionArray):
        array = array.storage
    if hasattr(feature, "cast_storage"):
        return feature.cast_storage(array)

    elif pa.types.is_struct(array.type):
        # feature must be a dict or Sequence(subfeatures_dict)
        if isinstance(feature, Sequence) and isinstance(feature.feature, dict):
            feature = {
                name: Sequence(subfeature, length=feature.length) for name, subfeature in feature.feature.items()
            }
        if isinstance(feature, dict) and {field.name for field in array.type} == set(feature):
            if array.type.num_fields == 0:
                return array
            arrays = [_c(array.field(name), subfeature) for name, subfeature in feature.items()]
            return pa.StructArray.from_arrays(arrays, names=list(feature), mask=array.is_null())
    elif pa.types.is_list(array.type):
        # feature must be either [subfeature] or Sequence(subfeature)
        if isinstance(feature, list):
            casted_array_values = _c(array.values, feature[0])
            if casted_array_values.type == array.values.type:
                return array
            else:
                # Merge offsets with the null bitmap to avoid the "Null bitmap with offsets slice not supported" ArrowNotImplementedError
                array_offsets = _combine_list_array_offsets_with_mask(array)
                return pa.ListArray.from_arrays(array_offsets, casted_array_values)
        elif isinstance(feature, Sequence):
            if feature.length > -1:
                if _are_list_values_of_length(array, feature.length):
                    if array.null_count > 0:
                        # Ensure each null value in the array translates to [null] * pa_type.list_size in the array's values array
                        array_type = array.type
                        storage_type = _storage_type(array_type)
                        if array_type != storage_type:
                            # Temporarily convert to the storage type to support extension types in the slice operation
                            array = array_cast(
                                array,
                                storage_type,
                                allow_primitive_to_str=allow_primitive_to_str,
                                allow_decimal_to_str=allow_decimal_to_str,
                            )
                            array = pc.list_slice(array, 0, feature.length, return_fixed_size_list=True)
                            array = array_cast(
                                array,
                                array_type,
                                allow_primitive_to_str=allow_primitive_to_str,
                                allow_decimal_to_str=allow_decimal_to_str,
                            )
                        else:
                            array = pc.list_slice(array, 0, feature.length, return_fixed_size_list=True)
                        array_values = array.values
                        casted_array_values = _c(array_values, feature.feature)
                        if config.PYARROW_VERSION.major < 15:
                            return pa.Array.from_buffers(
                                pa.list_(casted_array_values.type, feature.length),
                                len(array),
                                [array.is_valid().buffers()[1]],
                                children=[casted_array_values],
                            )
                        else:
                            return pa.FixedSizeListArray.from_arrays(
                                casted_array_values, feature.length, mask=array.is_null()
                            )
                    else:
                        array_values = array.values[
                            array.offset * feature.length : (array.offset + len(array)) * feature.length
                        ]
                        return pa.FixedSizeListArray.from_arrays(_c(array_values, feature.feature), feature.length)
            else:
                casted_array_values = _c(array.values, feature.feature)
                if casted_array_values.type == array.values.type:
                    return array
                else:
                    # Merge offsets with the null bitmap to avoid the "Null bitmap with offsets slice not supported" ArrowNotImplementedError
                    array_offsets = _combine_list_array_offsets_with_mask(array)
                    return pa.ListArray.from_arrays(array_offsets, casted_array_values)
    elif pa.types.is_fixed_size_list(array.type):
        # feature must be either [subfeature] or Sequence(subfeature)
        if isinstance(feature, list):
            array_offsets = (np.arange(len(array) + 1) + array.offset) * array.type.list_size
            return pa.ListArray.from_arrays(array_offsets, _c(array.values, feature[0]), mask=array.is_null())
        elif isinstance(feature, Sequence):
            if feature.length > -1:
                if feature.length == array.type.list_size:
                    array_values = array.values[
                        array.offset * array.type.list_size : (array.offset + len(array)) * array.type.list_size
                    ]
                    casted_array_values = _c(array_values, feature.feature)
                    if config.PYARROW_VERSION.major < 15:
                        return pa.Array.from_buffers(
                            pa.list_(casted_array_values.type, feature.length),
                            len(array),
                            [array.is_valid().buffers()[1]],
                            children=[casted_array_values],
                        )
                    else:
                        return pa.FixedSizeListArray.from_arrays(
                            casted_array_values, feature.length, mask=array.is_null()
                        )
            else:
                array_offsets = (np.arange(len(array) + 1) + array.offset) * array.type.list_size
                return pa.ListArray.from_arrays(array_offsets, _c(array.values, feature.feature), mask=array.is_null())
    if pa.types.is_null(array.type):
        return array_cast(
            array,
            get_nested_type(feature),
            allow_primitive_to_str=allow_primitive_to_str,
            allow_decimal_to_str=allow_decimal_to_str,
        )
    elif not isinstance(feature, (Sequence, dict, list, tuple)):
        return array_cast(
            array,
            feature(),
            allow_primitive_to_str=allow_primitive_to_str,
            allow_decimal_to_str=allow_decimal_to_str,
        )
    raise TypeError(f"Couldn't cast array of type\n{_short_str(array.type)}\nto\n{_short_str(feature)}")
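
# --- usage sketch (not part of the vendored file) ---
# A hedged usage sketch, assuming `datasets` is installed: the feature type
# drives the cast, and features with a `cast_storage` method (such as
# `ClassLabel` here) get to reinterpret the raw arrow data themselves.
import pyarrow as pa
from datasets import ClassLabel, Value
from datasets.table import cast_array_to_feature

# Plain value features delegate to array_cast:
floats = cast_array_to_feature(pa.array([1, 2, 3]), Value("float32"))
print(floats.type)  # float

# ClassLabel.cast_storage turns label strings into integer ids:
labels = cast_array_to_feature(pa.array(["neg", "pos"]), ClassLabel(names=["neg", "pos"]))
print(labels.to_pylist())  # [0, 1]
# --- end of sketch ---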


@_wrap_for_chunked_arrays
def embed_array_storage(array: pa.Array, feature: "FeatureType"):
    """Embed data into an array's storage.
    For custom features like Audio or Image, it takes into account the "embed_storage" methods
    they define to embed external data (e.g. an image file) into an array.

    <Added version="2.4.0"/>

    Args:
        array (`pa.Array`):
            The PyArrow array in which to embed data.
        feature (`datasets.features.FeatureType`):
            Array features.

    Raises:
        `TypeError`: if the target type is not supported, e.g.

            - if a field is missing

    Returns:
        array (`pyarrow.Array`): the casted array
    """
    from .features import Sequence

    _e = embed_array_storage

    if isinstance(array, pa.ExtensionArray):
        array = array.storage
    if hasattr(feature, "embed_storage"):
        return feature.embed_storage(array)
    elif pa.types.is_struct(array.type):
        # feature must be a dict or Sequence(subfeatures_dict)
        if isinstance(feature, Sequence) and isinstance(feature.feature, dict):
            feature = {
                name: Sequence(subfeature, length=feature.length) for name, subfeature in feature.feature.items()
            }
        if isinstance(feature, dict):
            arrays = [_e(array.field(name), subfeature) for name, subfeature in feature.items()]
            return pa.StructArray.from_arrays(arrays, names=list(feature), mask=array.is_null())
    elif pa.types.is_list(array.type):
        # feature must be either [subfeature] or Sequence(subfeature)
        # Merge offsets with the null bitmap to avoid the "Null bitmap with offsets slice not supported" ArrowNotImplementedError
        array_offsets = _combine_list_array_offsets_with_mask(array)
        if isinstance(feature, list):
            return pa.ListArray.from_arrays(array_offsets, _e(array.values, feature[0]))
        if isinstance(feature, Sequence) and feature.length == -1:
            return pa.ListArray.from_arrays(array_offsets, _e(array.values, feature.feature))
    elif pa.types.is_fixed_size_list(array.type):
        # feature must be Sequence(subfeature)
        if isinstance(feature, Sequence) and feature.length > -1:
            array_values = array.values[
                array.offset * array.type.list_size : (array.offset + len(array)) * array.type.list_size
            ]
            embedded_array_values = _e(array_values, feature.feature)
            if config.PYARROW_VERSION.major < 15:
                return pa.Array.from_buffers(
                    pa.list_(array_values.type, feature.length),
                    len(array),
                    [array.is_valid().buffers()[1]],
                    children=[embedded_array_values],
                )
            else:
                return pa.FixedSizeListArray.from_arrays(embedded_array_values, feature.length, mask=array.is_null())
    if not isinstance(feature, (Sequence, dict, list, tuple)):
        return array
    raise TypeError(f"Couldn't embed array of type\n{_short_str(array.type)}\nwith\n{_short_str(feature)}")


class CastError(ValueError):
    """When it's not possible to cast an Arrow table to a specific schema or set of features"""

    def __init__(self, *args, table_column_names: List[str], requested_column_names: List[str]) -> None:
        super().__init__(*args)
        self.table_column_names = table_column_names
        self.requested_column_names = requested_column_names

    def __reduce__(self):
        # Fix unpickling: TypeError: __init__() missing 2 required keyword-only arguments: 'table_column_names' and 'requested_column_names'
        return partial(
            CastError, table_column_names=self.table_column_names, requested_column_names=self.requested_column_names
        ), ()

    def details(self):
        new_columns = set(self.table_column_names) - set(self.requested_column_names)
        missing_columns = set(self.requested_column_names) - set(self.table_column_names)
        if new_columns and missing_columns:
            return f"there are {len(new_columns)} new columns ({_short_str(new_columns)}) and {len(missing_columns)} missing columns ({_short_str(missing_columns)})."
        elif new_columns:
            return f"there are {len(new_columns)} new columns ({_short_str(new_columns)})"
        else:
            return f"there are {len(missing_columns)} missing columns ({_short_str(missing_columns)})"


def cast_table_to_features(table: pa.Table, features: "Features"):
    """Cast a table to the arrow schema that corresponds to the requested features.

    Args:
        table (`pyarrow.Table`):
            PyArrow table to cast.
        features ([`Features`]):
            Target features.

    Returns:
        table (`pyarrow.Table`): the casted table
    """
    if sorted(table.column_names) != sorted(features):
        raise CastError(
            f"Couldn't cast\n{_short_str(table.schema)}\nto\n{_short_str(features)}\nbecause column names don't match",
            table_column_names=table.column_names,
            requested_column_names=list(features),
        )
    arrays = [cast_array_to_feature(table[name], feature) for name, feature in features.items()]
    return pa.Table.from_arrays(arrays, schema=features.arrow_schema)
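
# --- usage sketch (not part of the vendored file) ---
# A short sketch, assuming `datasets` is installed: column names must match
# exactly, otherwise a CastError carrying both name lists is raised.
import pyarrow as pa
from datasets import Features, Value
from datasets.table import CastError, cast_table_to_features

table = pa.table({"text": ["hi"], "label": [0]})
features = Features({"text": Value("string"), "label": Value("int32")})
print(cast_table_to_features(table, features).schema)

try:
    cast_table_to_features(table, Features({"text": Value("string")}))
except CastError as e:
    print(e.details())  # reports the extra "label" column
# --- end of sketch ---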


def cast_table_to_schema(table: pa.Table, schema: pa.Schema):
    """Cast a table to the arrow schema. Different from `cast_table_to_features`, this method can preserve nullability.

    Args:
        table (`pa.Table`):
            PyArrow table to cast.
        schema (`pa.Schema`):
            Target PyArrow schema.

    Returns:
        `pa.Table`: the casted table
    """
    from .features import Features

    features = Features.from_arrow_schema(schema)
    if sorted(table.column_names) != sorted(features):
        raise CastError(
            f"Couldn't cast\n{_short_str(table.schema)}\nto\n{_short_str(features)}\nbecause column names don't match",
            table_column_names=table.column_names,
            requested_column_names=list(features),
        )
    arrays = [cast_array_to_feature(table[name], feature) for name, feature in features.items()]
    return pa.Table.from_arrays(arrays, schema=schema)


def embed_table_storage(table: pa.Table):
    """Embed external data into a table's storage.

    <Added version="2.4.0"/>

    Args:
        table (`pyarrow.Table`):
            PyArrow table in which to embed data.

    Returns:
        table (`pyarrow.Table`): the table with embedded data
    """
    from .features.features import Features, require_storage_embed

    features = Features.from_arrow_schema(table.schema)
    arrays = [
        embed_array_storage(table[name], feature) if require_storage_embed(feature) else table[name]
        for name, feature in features.items()
    ]
    return pa.Table.from_arrays(arrays, schema=features.arrow_schema)


def table_cast(table: pa.Table, schema: pa.Schema):
    """Improved version of `pa.Table.cast`.

    It supports casting to feature types stored in the schema metadata.

    Args:
        table (`pyarrow.Table`):
            PyArrow table to cast.
        schema (`pyarrow.Schema`):
            Target PyArrow schema.

    Returns:
        table (`pyarrow.Table`): the casted table
    """
    if table.schema != schema:
        return cast_table_to_schema(table, schema)
    elif table.schema.metadata != schema.metadata:
        return table.replace_schema_metadata(schema.metadata)
    else:
        return table
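
# --- usage sketch (not part of the vendored file) ---
# A minimal sketch, assuming `datasets` is installed, of the fast paths in
# `table_cast`: an identical schema returns the table untouched, and a
# metadata-only difference is handled with `replace_schema_metadata` instead
# of a full cast (pa.Schema equality ignores metadata, hence the elif above).
import pyarrow as pa
from datasets.table import table_cast

table = pa.table({"a": [1, 2]})
same = table_cast(table, table.schema)
print(same is table)  # True

with_meta = table.schema.with_metadata({b"origin": b"example"})
print(table_cast(table, with_meta).schema.metadata)  # {b'origin': b'example'}
# --- end of sketch ---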


def table_flatten(table: pa.Table):
    """Improved version of `pa.Table.flatten`.

    It behaves as `pa.Table.flatten` in the sense that it does a 1-step flatten of the columns with a struct type into one column per struct field,
    but updates the metadata and skips decodable features unless the `decode` attribute of these features is set to False.

    Args:
        table (`pa.Table`):
            PyArrow table to flatten.

    Returns:
        `Table`: the flattened table
    """
    from .features import Features

    features = Features.from_arrow_schema(table.schema)
    if any(hasattr(subfeature, "flatten") and subfeature.flatten() == subfeature for subfeature in features.values()):
        flat_arrays = []
        flat_column_names = []
        for field in table.schema:
            array = table.column(field.name)
            subfeature = features[field.name]
            if pa.types.is_struct(field.type) and (
                not hasattr(subfeature, "flatten") or subfeature.flatten() != subfeature
            ):
                flat_arrays.extend(array.flatten())
                flat_column_names.extend([f"{field.name}.{subfield.name}" for subfield in field.type])
            else:
                flat_arrays.append(array)
                flat_column_names.append(field.name)
        flat_table = pa.Table.from_arrays(
            flat_arrays,
            names=flat_column_names,
        )
    else:
        flat_table = table.flatten()
    # Preserve complex types in the metadata
    flat_features = features.flatten(max_depth=2)
    flat_features = Features({column_name: flat_features[column_name] for column_name in flat_table.column_names})
    return flat_table.replace_schema_metadata(flat_features.arrow_schema.metadata)


def table_visitor(table: pa.Table, function: Callable[[pa.Array], None]):
    """Visit all arrays in a table and apply a function to them.

    Args:
        table (`pyarrow.Table`):
            PyArrow table to visit.
        function (`Callable[[pa.Array], None]`):
            Function to apply to each array.
    """
    from .features import Features, Sequence

    features = Features.from_arrow_schema(table.schema)

    def _visit(array, feature):
        if isinstance(array, pa.ChunkedArray):
            for chunk in array.chunks:
                _visit(chunk, feature)
        else:
            if isinstance(array, pa.ExtensionArray):
                array = array.storage
            function(array, feature)
            if pa.types.is_struct(array.type) and not hasattr(feature, "cast_storage"):
                if isinstance(feature, Sequence) and isinstance(feature.feature, dict):
                    feature = {
                        name: Sequence(subfeature, length=feature.length)
                        for name, subfeature in feature.feature.items()
                    }
                for name, subfeature in feature.items():
                    _visit(array.field(name), subfeature)
            elif pa.types.is_list(array.type):
                if isinstance(feature, list):
                    _visit(array.values, feature[0])
                elif isinstance(feature, Sequence):
                    _visit(array.values, feature.feature)

    for name, feature in features.items():
        _visit(table[name], feature)


def table_iter(table: Table, batch_size: int, drop_last_batch=False) -> Iterator[pa.Table]:
    """Iterate over sub-tables of size `batch_size`.

    Args:
        table (`pyarrow.Table`):
            PyArrow table to iterate over.
        batch_size (`int`):
            Size of each sub-table to yield.
        drop_last_batch (`bool`, defaults to `False`):
            Drop the last batch if it is smaller than `batch_size`.
    """
    chunks_buffer = []
    chunks_buffer_size = 0
    for chunk in table.to_reader(max_chunksize=batch_size):
        if len(chunk) == 0:
            continue
        elif chunks_buffer_size + len(chunk) < batch_size:
            chunks_buffer.append(chunk)
            chunks_buffer_size += len(chunk)
            continue
        elif chunks_buffer_size + len(chunk) == batch_size:
            chunks_buffer.append(chunk)
            yield pa.Table.from_batches(chunks_buffer)
            chunks_buffer = []
            chunks_buffer_size = 0
        else:
            cropped_chunk_length = batch_size - chunks_buffer_size
            chunks_buffer.append(chunk.slice(0, cropped_chunk_length))
            yield pa.Table.from_batches(chunks_buffer)
            chunks_buffer = [chunk.slice(cropped_chunk_length, len(chunk) - cropped_chunk_length)]
            chunks_buffer_size = len(chunk) - cropped_chunk_length
    if not drop_last_batch and chunks_buffer:
        yield pa.Table.from_batches(chunks_buffer)
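
# --- usage sketch (not part of the vendored file) ---
# Usage sketch, assuming `datasets` is installed: `table_iter` re-batches the
# table's record batches into sub-tables of exactly `batch_size` rows, with an
# optional short final batch.
import pyarrow as pa
from datasets.table import table_iter

table = pa.table({"a": list(range(10))})
for batch in table_iter(table, batch_size=4):
    print(batch.num_rows)  # 4, 4, 2

for batch in table_iter(table, batch_size=4, drop_last_batch=True):
    print(batch.num_rows)  # 4, 4
# --- end of sketch ---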

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/frozenlist/__init__.py
ADDED
@@ -0,0 +1,98 @@
import os
import sys
import types
from collections.abc import MutableSequence
from functools import total_ordering
from typing import Any, Type

__version__ = "1.5.0"

__all__ = ("FrozenList", "PyFrozenList")  # type: Tuple[str, ...]


NO_EXTENSIONS = bool(os.environ.get("FROZENLIST_NO_EXTENSIONS"))  # type: bool


@total_ordering
class FrozenList(MutableSequence):
    __slots__ = ("_frozen", "_items")

    if sys.version_info >= (3, 9):
        __class_getitem__ = classmethod(types.GenericAlias)
    else:

        @classmethod
        def __class_getitem__(
            cls: Type["FrozenList"],
            cls_item: Any,
        ) -> Type["FrozenList"]:
            return cls

    def __init__(self, items=None):
        self._frozen = False
        if items is not None:
            items = list(items)
        else:
            items = []
        self._items = items

    @property
    def frozen(self):
        return self._frozen

    def freeze(self):
        self._frozen = True

    def __getitem__(self, index):
        return self._items[index]

    def __setitem__(self, index, value):
        if self._frozen:
            raise RuntimeError("Cannot modify frozen list.")
        self._items[index] = value

    def __delitem__(self, index):
        if self._frozen:
            raise RuntimeError("Cannot modify frozen list.")
        del self._items[index]

    def __len__(self):
        return self._items.__len__()

    def __iter__(self):
        return self._items.__iter__()

    def __reversed__(self):
        return self._items.__reversed__()

    def __eq__(self, other):
        return list(self) == other

    def __le__(self, other):
        return list(self) <= other

    def insert(self, pos, item):
        if self._frozen:
            raise RuntimeError("Cannot modify frozen list.")
        self._items.insert(pos, item)

    def __repr__(self):
        return f"<FrozenList(frozen={self._frozen}, {self._items!r})>"

    def __hash__(self):
        if self._frozen:
            return hash(tuple(self))
        else:
            raise RuntimeError("Cannot hash unfrozen list.")


PyFrozenList = FrozenList


if not NO_EXTENSIONS:
    try:
        from ._frozenlist import FrozenList as CFrozenList  # type: ignore
    except ImportError:  # pragma: no cover
        pass
    else:
        FrozenList = CFrozenList  # type: ignore
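
# --- usage sketch (not part of the vendored file) ---
# A short usage sketch of the FrozenList contract shown above: it behaves as a
# regular mutable sequence until `freeze()` is called, after which mutation
# raises RuntimeError and the list becomes hashable.
from frozenlist import FrozenList

fl = FrozenList([1, 2])
fl.append(3)            # fine while unfrozen
fl.freeze()
print(fl.frozen)        # True
print(hash(fl) == hash((1, 2, 3)))  # True: hash of the frozen contents

try:
    fl.append(4)
except RuntimeError as e:
    print(e)            # Cannot modify frozen list.
# --- end of sketch ---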

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/frozenlist/__init__.pyi
ADDED
@@ -0,0 +1,47 @@
from typing import (
    Generic,
    Iterable,
    Iterator,
    List,
    MutableSequence,
    Optional,
    TypeVar,
    Union,
    overload,
)

_T = TypeVar("_T")
_Arg = Union[List[_T], Iterable[_T]]

class FrozenList(MutableSequence[_T], Generic[_T]):
    def __init__(self, items: Optional[_Arg[_T]] = None) -> None: ...
    @property
    def frozen(self) -> bool: ...
    def freeze(self) -> None: ...
    @overload
    def __getitem__(self, i: int) -> _T: ...
    @overload
    def __getitem__(self, s: slice) -> FrozenList[_T]: ...
    @overload
    def __setitem__(self, i: int, o: _T) -> None: ...
    @overload
    def __setitem__(self, s: slice, o: Iterable[_T]) -> None: ...
    @overload
    def __delitem__(self, i: int) -> None: ...
    @overload
    def __delitem__(self, i: slice) -> None: ...
    def __len__(self) -> int: ...
    def __iter__(self) -> Iterator[_T]: ...
    def __reversed__(self) -> Iterator[_T]: ...
    def __eq__(self, other: object) -> bool: ...
    def __le__(self, other: FrozenList[_T]) -> bool: ...
    def __ne__(self, other: object) -> bool: ...
    def __lt__(self, other: FrozenList[_T]) -> bool: ...
    def __ge__(self, other: FrozenList[_T]) -> bool: ...
    def __gt__(self, other: FrozenList[_T]) -> bool: ...
    def insert(self, pos: int, item: _T) -> None: ...
    def __repr__(self) -> str: ...
    def __hash__(self) -> int: ...

# types for C accelerators are the same
CFrozenList = PyFrozenList = FrozenList

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/frozenlist/_frozenlist.pyx
ADDED
@@ -0,0 +1,123 @@
import sys
import types
from collections.abc import MutableSequence


cdef class FrozenList:

    if sys.version_info >= (3, 9):
        __class_getitem__ = classmethod(types.GenericAlias)
    else:
        @classmethod
        def __class_getitem__(cls, cls_item):
            return cls

    cdef readonly bint frozen
    cdef list _items

    def __init__(self, items=None):
        self.frozen = False
        if items is not None:
            items = list(items)
        else:
            items = []
        self._items = items

    cdef object _check_frozen(self):
        if self.frozen:
            raise RuntimeError("Cannot modify frozen list.")

    cdef inline object _fast_len(self):
        return len(self._items)

    def freeze(self):
        self.frozen = True

    def __getitem__(self, index):
        return self._items[index]

    def __setitem__(self, index, value):
        self._check_frozen()
        self._items[index] = value

    def __delitem__(self, index):
        self._check_frozen()
        del self._items[index]

    def __len__(self):
        return self._fast_len()

    def __iter__(self):
        return self._items.__iter__()

    def __reversed__(self):
        return self._items.__reversed__()

    def __richcmp__(self, other, op):
        if op == 0:  # <
            return list(self) < other
        if op == 1:  # <=
            return list(self) <= other
        if op == 2:  # ==
            return list(self) == other
        if op == 3:  # !=
            return list(self) != other
        if op == 4:  # >
            return list(self) > other
        if op == 5:  # >=
            return list(self) >= other

    def insert(self, pos, item):
        self._check_frozen()
        self._items.insert(pos, item)

    def __contains__(self, item):
        return item in self._items

    def __iadd__(self, items):
        self._check_frozen()
        self._items += list(items)
        return self

    def index(self, item):
        return self._items.index(item)

    def remove(self, item):
        self._check_frozen()
        self._items.remove(item)

    def clear(self):
        self._check_frozen()
        self._items.clear()

    def extend(self, items):
        self._check_frozen()
        self._items += list(items)

    def reverse(self):
        self._check_frozen()
        self._items.reverse()

    def pop(self, index=-1):
        self._check_frozen()
        return self._items.pop(index)

    def append(self, item):
        self._check_frozen()
        return self._items.append(item)

    def count(self, item):
        return self._items.count(item)

    def __repr__(self):
        return '<FrozenList(frozen={}, {!r})>'.format(self.frozen,
                                                      self._items)

    def __hash__(self):
        if self.frozen:
            return hash(tuple(self._items))
        else:
            raise RuntimeError("Cannot hash unfrozen list.")


MutableSequence.register(FrozenList)
|
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/frozenlist/py.typed
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
Marker
|
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_models.py
ADDED
@@ -0,0 +1,516 @@
+from __future__ import annotations
+
+import base64
+import ssl
+import typing
+import urllib.parse
+
+# Functions for typechecking...
+
+
+ByteOrStr = typing.Union[bytes, str]
+HeadersAsSequence = typing.Sequence[typing.Tuple[ByteOrStr, ByteOrStr]]
+HeadersAsMapping = typing.Mapping[ByteOrStr, ByteOrStr]
+HeaderTypes = typing.Union[HeadersAsSequence, HeadersAsMapping, None]
+
+Extensions = typing.MutableMapping[str, typing.Any]
+
+
+def enforce_bytes(value: bytes | str, *, name: str) -> bytes:
+    """
+    Any arguments that are ultimately represented as bytes can be specified
+    either as bytes or as strings.
+
+    However, we enforce that any string arguments must only contain characters in
+    the plain ASCII range: chr(0)...chr(127). If you need to use characters
+    outside that range then be precise, and use a byte-wise argument.
+    """
+    if isinstance(value, str):
+        try:
+            return value.encode("ascii")
+        except UnicodeEncodeError:
+            raise TypeError(f"{name} strings may not include unicode characters.")
+    elif isinstance(value, bytes):
+        return value
+
+    seen_type = type(value).__name__
+    raise TypeError(f"{name} must be bytes or str, but got {seen_type}.")
+
+
+def enforce_url(value: URL | bytes | str, *, name: str) -> URL:
+    """
+    Type check for URL parameters.
+    """
+    if isinstance(value, (bytes, str)):
+        return URL(value)
+    elif isinstance(value, URL):
+        return value
+
+    seen_type = type(value).__name__
+    raise TypeError(f"{name} must be a URL, bytes, or str, but got {seen_type}.")
+
+
+def enforce_headers(
+    value: HeadersAsMapping | HeadersAsSequence | None = None, *, name: str
+) -> list[tuple[bytes, bytes]]:
+    """
+    Convenience function that ensures all items in request or response headers
+    are either bytes or strings in the plain ASCII range.
+    """
+    if value is None:
+        return []
+    elif isinstance(value, typing.Mapping):
+        return [
+            (
+                enforce_bytes(k, name="header name"),
+                enforce_bytes(v, name="header value"),
+            )
+            for k, v in value.items()
+        ]
+    elif isinstance(value, typing.Sequence):
+        return [
+            (
+                enforce_bytes(k, name="header name"),
+                enforce_bytes(v, name="header value"),
+            )
+            for k, v in value
+        ]
+
+    seen_type = type(value).__name__
+    raise TypeError(
+        f"{name} must be a mapping or sequence of two-tuples, but got {seen_type}."
+    )
+
+
+def enforce_stream(
+    value: bytes | typing.Iterable[bytes] | typing.AsyncIterable[bytes] | None,
+    *,
+    name: str,
+) -> typing.Iterable[bytes] | typing.AsyncIterable[bytes]:
+    if value is None:
+        return ByteStream(b"")
+    elif isinstance(value, bytes):
+        return ByteStream(value)
+    return value
+
+
+# * https://tools.ietf.org/html/rfc3986#section-3.2.3
+# * https://url.spec.whatwg.org/#url-miscellaneous
+# * https://url.spec.whatwg.org/#scheme-state
+DEFAULT_PORTS = {
+    b"ftp": 21,
+    b"http": 80,
+    b"https": 443,
+    b"ws": 80,
+    b"wss": 443,
+}
+
+
+def include_request_headers(
+    headers: list[tuple[bytes, bytes]],
+    *,
+    url: "URL",
+    content: None | bytes | typing.Iterable[bytes] | typing.AsyncIterable[bytes],
+) -> list[tuple[bytes, bytes]]:
+    headers_set = set(k.lower() for k, v in headers)
+
+    if b"host" not in headers_set:
+        default_port = DEFAULT_PORTS.get(url.scheme)
+        if url.port is None or url.port == default_port:
+            header_value = url.host
+        else:
+            header_value = b"%b:%d" % (url.host, url.port)
+        headers = [(b"Host", header_value)] + headers
+
+    if (
+        content is not None
+        and b"content-length" not in headers_set
+        and b"transfer-encoding" not in headers_set
+    ):
+        if isinstance(content, bytes):
+            content_length = str(len(content)).encode("ascii")
+            headers += [(b"Content-Length", content_length)]
+        else:
+            headers += [(b"Transfer-Encoding", b"chunked")]  # pragma: nocover
+
+    return headers
+
+
+# Interfaces for byte streams...
+
+
+class ByteStream:
+    """
+    A container for non-streaming content, and that supports both sync and async
+    stream iteration.
+    """
+
+    def __init__(self, content: bytes) -> None:
+        self._content = content
+
+    def __iter__(self) -> typing.Iterator[bytes]:
+        yield self._content
+
+    async def __aiter__(self) -> typing.AsyncIterator[bytes]:
+        yield self._content
+
+    def __repr__(self) -> str:
+        return f"<{self.__class__.__name__} [{len(self._content)} bytes]>"
+
+
+class Origin:
+    def __init__(self, scheme: bytes, host: bytes, port: int) -> None:
+        self.scheme = scheme
+        self.host = host
+        self.port = port
+
+    def __eq__(self, other: typing.Any) -> bool:
+        return (
+            isinstance(other, Origin)
+            and self.scheme == other.scheme
+            and self.host == other.host
+            and self.port == other.port
+        )
+
+    def __str__(self) -> str:
+        scheme = self.scheme.decode("ascii")
+        host = self.host.decode("ascii")
+        port = str(self.port)
+        return f"{scheme}://{host}:{port}"
+
+
+class URL:
+    """
+    Represents the URL against which an HTTP request may be made.
+
+    The URL may either be specified as a plain string, for convenience:
+
+    ```python
+    url = httpcore.URL("https://www.example.com/")
+    ```
+
+    Or be constructed with explicitly pre-parsed components:
+
+    ```python
+    url = httpcore.URL(scheme=b'https', host=b'www.example.com', port=None, target=b'/')
+    ```
+
+    Using this second more explicit style allows integrations that are using
+    `httpcore` to pass through URLs that have already been parsed in order to use
+    libraries such as `rfc-3986` rather than relying on the stdlib. It also ensures
+    that URL parsing is treated identically at both the networking level and at any
+    higher layers of abstraction.
+
+    The four components are important here, as they allow the URL to be precisely
+    specified in a pre-parsed format. They also allow certain types of request to
+    be created that could not otherwise be expressed.
+
+    For example, an HTTP request to `http://www.example.com/` forwarded via a proxy
+    at `http://localhost:8080`...
+
+    ```python
+    # Constructs an HTTP request with a complete URL as the target:
+    # GET https://www.example.com/ HTTP/1.1
+    url = httpcore.URL(
+        scheme=b'http',
+        host=b'localhost',
+        port=8080,
+        target=b'https://www.example.com/'
+    )
+    request = httpcore.Request(
+        method="GET",
+        url=url
+    )
+    ```
+
+    Another example is constructing an `OPTIONS *` request...
+
+    ```python
+    # Constructs an 'OPTIONS *' HTTP request:
+    # OPTIONS * HTTP/1.1
+    url = httpcore.URL(scheme=b'https', host=b'www.example.com', target=b'*')
+    request = httpcore.Request(method="OPTIONS", url=url)
+    ```
+
+    This kind of request is not possible to formulate with a URL string,
+    because the `/` delimiter is always used to demark the target from the
+    host/port portion of the URL.
+
+    For convenience, string-like arguments may be specified either as strings or
+    as bytes. However, once a request is being issued over-the-wire, the URL
+    components are always ultimately required to be a bytewise representation.
+
+    In order to avoid any ambiguity over character encodings, when strings are used
+    as arguments, they must be strictly limited to the ASCII range `chr(0)`-`chr(127)`.
+    If you require a bytewise representation that is outside this range you must
+    handle the character encoding directly, and pass a bytes instance.
+    """
+
+    def __init__(
+        self,
+        url: bytes | str = "",
+        *,
+        scheme: bytes | str = b"",
+        host: bytes | str = b"",
+        port: int | None = None,
+        target: bytes | str = b"",
+    ) -> None:
+        """
+        Parameters:
+            url: The complete URL as a string or bytes.
+            scheme: The URL scheme as a string or bytes.
+                Typically either `"http"` or `"https"`.
+            host: The URL host as a string or bytes. Such as `"www.example.com"`.
+            port: The port to connect to. Either an integer or `None`.
+            target: The target of the HTTP request. Such as `"/items?search=red"`.
+        """
+        if url:
+            parsed = urllib.parse.urlparse(enforce_bytes(url, name="url"))
+            self.scheme = parsed.scheme
+            self.host = parsed.hostname or b""
+            self.port = parsed.port
+            self.target = (parsed.path or b"/") + (
+                b"?" + parsed.query if parsed.query else b""
+            )
+        else:
+            self.scheme = enforce_bytes(scheme, name="scheme")
+            self.host = enforce_bytes(host, name="host")
+            self.port = port
+            self.target = enforce_bytes(target, name="target")
+
+    @property
+    def origin(self) -> Origin:
+        default_port = {
+            b"http": 80,
+            b"https": 443,
+            b"ws": 80,
+            b"wss": 443,
+            b"socks5": 1080,
+            b"socks5h": 1080,
+        }[self.scheme]
+        return Origin(
+            scheme=self.scheme, host=self.host, port=self.port or default_port
+        )
+
+    def __eq__(self, other: typing.Any) -> bool:
+        return (
+            isinstance(other, URL)
+            and other.scheme == self.scheme
+            and other.host == self.host
+            and other.port == self.port
+            and other.target == self.target
+        )
+
+    def __bytes__(self) -> bytes:
+        if self.port is None:
+            return b"%b://%b%b" % (self.scheme, self.host, self.target)
+        return b"%b://%b:%d%b" % (self.scheme, self.host, self.port, self.target)
+
+    def __repr__(self) -> str:
+        return (
+            f"{self.__class__.__name__}(scheme={self.scheme!r}, "
+            f"host={self.host!r}, port={self.port!r}, target={self.target!r})"
+        )
+
+
+class Request:
+    """
+    An HTTP request.
+    """
+
+    def __init__(
+        self,
+        method: bytes | str,
+        url: URL | bytes | str,
+        *,
+        headers: HeaderTypes = None,
+        content: bytes
+        | typing.Iterable[bytes]
+        | typing.AsyncIterable[bytes]
+        | None = None,
+        extensions: Extensions | None = None,
+    ) -> None:
+        """
+        Parameters:
+            method: The HTTP request method, either as a string or bytes.
+                For example: `GET`.
+            url: The request URL, either as a `URL` instance, or as a string or bytes.
+                For example: `"https://www.example.com"`.
+            headers: The HTTP request headers.
+            content: The content of the request body.
+            extensions: A dictionary of optional extra information included on
+                the request. Possible keys include `"timeout"`, and `"trace"`.
+        """
+        self.method: bytes = enforce_bytes(method, name="method")
+        self.url: URL = enforce_url(url, name="url")
+        self.headers: list[tuple[bytes, bytes]] = enforce_headers(
+            headers, name="headers"
+        )
+        self.stream: typing.Iterable[bytes] | typing.AsyncIterable[bytes] = (
+            enforce_stream(content, name="content")
+        )
+        self.extensions = {} if extensions is None else extensions
+
+        if "target" in self.extensions:
+            self.url = URL(
+                scheme=self.url.scheme,
+                host=self.url.host,
+                port=self.url.port,
+                target=self.extensions["target"],
+            )
+
+    def __repr__(self) -> str:
+        return f"<{self.__class__.__name__} [{self.method!r}]>"
+
+
+class Response:
+    """
+    An HTTP response.
+    """
+
+    def __init__(
+        self,
+        status: int,
+        *,
+        headers: HeaderTypes = None,
+        content: bytes
+        | typing.Iterable[bytes]
+        | typing.AsyncIterable[bytes]
+        | None = None,
+        extensions: Extensions | None = None,
+    ) -> None:
+        """
+        Parameters:
+            status: The HTTP status code of the response. For example `200`.
+            headers: The HTTP response headers.
+            content: The content of the response body.
+            extensions: A dictionary of optional extra information included on
+                the response. Possible keys include `"http_version"`,
+                `"reason_phrase"`, and `"network_stream"`.
+        """
+        self.status: int = status
+        self.headers: list[tuple[bytes, bytes]] = enforce_headers(
+            headers, name="headers"
+        )
+        self.stream: typing.Iterable[bytes] | typing.AsyncIterable[bytes] = (
+            enforce_stream(content, name="content")
+        )
+        self.extensions = {} if extensions is None else extensions
+
+        self._stream_consumed = False
+
+    @property
+    def content(self) -> bytes:
+        if not hasattr(self, "_content"):
+            if isinstance(self.stream, typing.Iterable):
+                raise RuntimeError(
+                    "Attempted to access 'response.content' on a streaming response. "
+                    "Call 'response.read()' first."
+                )
+            else:
+                raise RuntimeError(
+                    "Attempted to access 'response.content' on a streaming response. "
+                    "Call 'await response.aread()' first."
+                )
+        return self._content
+
+    def __repr__(self) -> str:
+        return f"<{self.__class__.__name__} [{self.status}]>"
+
+    # Sync interface...
+
+    def read(self) -> bytes:
+        if not isinstance(self.stream, typing.Iterable):  # pragma: nocover
+            raise RuntimeError(
+                "Attempted to read an asynchronous response using 'response.read()'. "
+                "You should use 'await response.aread()' instead."
+            )
+        if not hasattr(self, "_content"):
+            self._content = b"".join([part for part in self.iter_stream()])
+        return self._content
+
+    def iter_stream(self) -> typing.Iterator[bytes]:
+        if not isinstance(self.stream, typing.Iterable):  # pragma: nocover
+            raise RuntimeError(
+                "Attempted to stream an asynchronous response using 'for ... in "
+                "response.iter_stream()'. "
+                "You should use 'async for ... in response.aiter_stream()' instead."
+            )
+        if self._stream_consumed:
+            raise RuntimeError(
+                "Attempted to call 'for ... in response.iter_stream()' more than once."
+            )
+        self._stream_consumed = True
+        for chunk in self.stream:
+            yield chunk
+
+    def close(self) -> None:
+        if not isinstance(self.stream, typing.Iterable):  # pragma: nocover
+            raise RuntimeError(
+                "Attempted to close an asynchronous response using 'response.close()'. "
+                "You should use 'await response.aclose()' instead."
+            )
+        if hasattr(self.stream, "close"):
+            self.stream.close()
+
+    # Async interface...
+
+    async def aread(self) -> bytes:
+        if not isinstance(self.stream, typing.AsyncIterable):  # pragma: nocover
+            raise RuntimeError(
+                "Attempted to read a synchronous response using "
+                "'await response.aread()'. "
+                "You should use 'response.read()' instead."
+            )
+        if not hasattr(self, "_content"):
+            self._content = b"".join([part async for part in self.aiter_stream()])
+        return self._content
+
+    async def aiter_stream(self) -> typing.AsyncIterator[bytes]:
+        if not isinstance(self.stream, typing.AsyncIterable):  # pragma: nocover
+            raise RuntimeError(
+                "Attempted to stream a synchronous response using 'async for ... in "
+                "response.aiter_stream()'. "
+                "You should use 'for ... in response.iter_stream()' instead."
+            )
+        if self._stream_consumed:
+            raise RuntimeError(
+                "Attempted to call 'async for ... in response.aiter_stream()' "
+                "more than once."
+            )
+        self._stream_consumed = True
+        async for chunk in self.stream:
+            yield chunk
+
+    async def aclose(self) -> None:
+        if not isinstance(self.stream, typing.AsyncIterable):  # pragma: nocover
+            raise RuntimeError(
+                "Attempted to close a synchronous response using "
+                "'await response.aclose()'. "
+                "You should use 'response.close()' instead."
+            )
+        if hasattr(self.stream, "aclose"):
+            await self.stream.aclose()
+
+
+class Proxy:
+    def __init__(
+        self,
+        url: URL | bytes | str,
+        auth: tuple[bytes | str, bytes | str] | None = None,
+        headers: HeadersAsMapping | HeadersAsSequence | None = None,
+        ssl_context: ssl.SSLContext | None = None,
+    ):
+        self.url = enforce_url(url, name="url")
+        self.headers = enforce_headers(headers, name="headers")
+        self.ssl_context = ssl_context
+
+        if auth is not None:
+            username = enforce_bytes(auth[0], name="auth")
+            password = enforce_bytes(auth[1], name="auth")
+            userpass = username + b":" + password
+            authorization = b"Basic " + base64.b64encode(userpass)
+            self.auth: tuple[bytes, bytes] | None = (username, password)
+            self.headers = [(b"Proxy-Authorization", authorization)] + self.headers
+        else:
+            self.auth = None
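Note: the docstrings above describe the two ways of building a `URL` and how request/response bodies are normalized into byte streams. A short sketch tying them together (illustrative only, using the public `httpcore` names defined in this file):

```python
import httpcore

# A URL parsed from a plain string...
url = httpcore.URL("https://www.example.com/items?search=red")
assert url.scheme == b"https" and url.target == b"/items?search=red"

# ...compares equal to one built from pre-parsed byte components.
assert url == httpcore.URL(
    scheme=b"https", host=b"www.example.com", port=None, target=b"/items?search=red"
)

# Headers and content are normalized by the enforce_* helpers on construction.
request = httpcore.Request("GET", url, headers={"Accept": "text/html"})
assert request.headers == [(b"Accept", b"text/html")]

# Non-streaming content is wrapped in ByteStream, so read() works immediately.
response = httpcore.Response(200, content=b"Hello, world!")
assert response.read() == b"Hello, world!"
assert response.content == b"Hello, world!"
```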
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_ssl.py
ADDED
@@ -0,0 +1,9 @@
+import ssl
+
+import certifi
+
+
+def default_ssl_context() -> ssl.SSLContext:
+    context = ssl.create_default_context()
+    context.load_verify_locations(certifi.where())
+    return context
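Note: `default_ssl_context()` layers certifi's CA bundle onto `ssl.create_default_context()`, which keeps certificate verification and hostname checking enabled. A quick check (importing the private module shown in this diff, so treat the import path as an assumption about this vendored layout):

```python
import ssl
from httpcore._ssl import default_ssl_context  # private module from the diff above

ctx = default_ssl_context()
assert ctx.verify_mode == ssl.CERT_REQUIRED  # stdlib default, left enabled
assert ctx.check_hostname
```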
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_utils.py
ADDED
@@ -0,0 +1,37 @@
+from __future__ import annotations
+
+import select
+import socket
+import sys
+
+
+def is_socket_readable(sock: socket.socket | None) -> bool:
+    """
+    Return whether a socket, as identified by its file descriptor, is readable.
+    "A socket is readable" means that the read buffer isn't empty, i.e. that calling
+    .recv() on it would immediately return some data.
+    """
+    # NOTE: we want to check for readability without actually attempting to read,
+    # because we don't want to block forever if it's not readable.
+
+    # In the case that the socket no longer exists, or cannot return a file
+    # descriptor, we treat it as being readable, as if the next read operation
+    # on it is ready to return the terminating `b""`.
+    sock_fd = None if sock is None else sock.fileno()
+    if sock_fd is None or sock_fd < 0:  # pragma: nocover
+        return True
+
+    # The implementation below was stolen from:
+    # https://github.com/python-trio/trio/blob/20ee2b1b7376db637435d80e266212a35837ddcc/trio/_socket.py#L471-L478
+    # See also: https://github.com/encode/httpcore/pull/193#issuecomment-703129316
+
+    # Use select.select on Windows or when poll is unavailable, and select.poll
+    # everywhere else. (E.g. when eventlet is in use. See #327)
+    if (
+        sys.platform == "win32" or getattr(select, "poll", None) is None
+    ):  # pragma: nocover
+        rready, _, _ = select.select([sock_fd], [], [], 0)
+        return bool(rready)
+    p = select.poll()
+    p.register(sock_fd, select.POLLIN)
+    return bool(p.poll(0))
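Note: `is_socket_readable()` polls with a zero timeout, so it never blocks; it only reports whether `.recv()` would return immediately. A small sanity check with a local socket pair (a sketch, assuming a platform where `socket.socketpair()` is available):

```python
import socket
from httpcore._utils import is_socket_readable  # module shown in the diff above

a, b = socket.socketpair()
print(is_socket_readable(a))   # False: nothing buffered yet
b.sendall(b"ping")
print(is_socket_readable(a))   # True: recv() would return data immediately
a.close()
b.close()
```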
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/py.typed
ADDED
File without changes
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/jinja2-3.1.6.dist-info/INSTALLER
ADDED
@@ -0,0 +1 @@
+pip
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/jinja2-3.1.6.dist-info/METADATA
ADDED
@@ -0,0 +1,84 @@
+Metadata-Version: 2.4
+Name: Jinja2
+Version: 3.1.6
+Summary: A very fast and expressive template engine.
+Maintainer-email: Pallets <contact@palletsprojects.com>
+Requires-Python: >=3.7
+Description-Content-Type: text/markdown
+Classifier: Development Status :: 5 - Production/Stable
+Classifier: Environment :: Web Environment
+Classifier: Intended Audience :: Developers
+Classifier: License :: OSI Approved :: BSD License
+Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: Python
+Classifier: Topic :: Internet :: WWW/HTTP :: Dynamic Content
+Classifier: Topic :: Text Processing :: Markup :: HTML
+Classifier: Typing :: Typed
+License-File: LICENSE.txt
+Requires-Dist: MarkupSafe>=2.0
+Requires-Dist: Babel>=2.7 ; extra == "i18n"
+Project-URL: Changes, https://jinja.palletsprojects.com/changes/
+Project-URL: Chat, https://discord.gg/pallets
+Project-URL: Documentation, https://jinja.palletsprojects.com/
+Project-URL: Donate, https://palletsprojects.com/donate
+Project-URL: Source, https://github.com/pallets/jinja/
+Provides-Extra: i18n
+
+# Jinja
+
+Jinja is a fast, expressive, extensible templating engine. Special
+placeholders in the template allow writing code similar to Python
+syntax. Then the template is passed data to render the final document.
+
+It includes:
+
+- Template inheritance and inclusion.
+- Define and import macros within templates.
+- HTML templates can use autoescaping to prevent XSS from untrusted
+  user input.
+- A sandboxed environment can safely render untrusted templates.
+- AsyncIO support for generating templates and calling async
+  functions.
+- I18N support with Babel.
+- Templates are compiled to optimized Python code just-in-time and
+  cached, or can be compiled ahead-of-time.
+- Exceptions point to the correct line in templates to make debugging
+  easier.
+- Extensible filters, tests, functions, and even syntax.
+
+Jinja's philosophy is that while application logic belongs in Python if
+possible, it shouldn't make the template designer's job difficult by
+restricting functionality too much.
+
+
+## In A Nutshell
+
+```jinja
+{% extends "base.html" %}
+{% block title %}Members{% endblock %}
+{% block content %}
+  <ul>
+  {% for user in users %}
+    <li><a href="{{ user.url }}">{{ user.username }}</a></li>
+  {% endfor %}
+  </ul>
+{% endblock %}
+```
+
+## Donate
+
+The Pallets organization develops and supports Jinja and other popular
+packages. In order to grow the community of contributors and users, and
+allow the maintainers to devote more time to the projects, [please
+donate today][].
+
+[please donate today]: https://palletsprojects.com/donate
+
+## Contributing
+
+See our [detailed contributing documentation][contrib] for many ways to
+contribute, including reporting issues, requesting features, asking or answering
+questions, and making PRs.
+
+[contrib]: https://palletsprojects.com/contributing/
+
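Note: the "In A Nutshell" template above extends a `base.html` that the METADATA doesn't show. A self-contained rendering sketch, with an invented stand-in `base.html` (not shipped with Jinja):

```python
from jinja2 import DictLoader, Environment

env = Environment(loader=DictLoader({
    # Hypothetical parent template supplying the blocks the example overrides.
    "base.html": (
        "<title>{% block title %}{% endblock %}</title>\n"
        "{% block content %}{% endblock %}"
    ),
    "members.html": (
        '{% extends "base.html" %}'
        "{% block title %}Members{% endblock %}"
        "{% block content %}<ul>{% for user in users %}"
        '<li><a href="{{ user.url }}">{{ user.username }}</a></li>'
        "{% endfor %}</ul>{% endblock %}"
    ),
}))

html = env.get_template("members.html").render(
    users=[{"url": "/u/alice", "username": "alice"}]
)
print(html)
```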
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/jinja2-3.1.6.dist-info/RECORD
ADDED
@@ -0,0 +1,57 @@
+jinja2-3.1.6.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
+jinja2-3.1.6.dist-info/METADATA,sha256=aMVUj7Z8QTKhOJjZsx7FDGvqKr3ZFdkh8hQ1XDpkmcg,2871
+jinja2-3.1.6.dist-info/RECORD,,
+jinja2-3.1.6.dist-info/WHEEL,sha256=_2ozNFCLWc93bK4WKHCO-eDUENDlo-dgc9cU3qokYO4,82
+jinja2-3.1.6.dist-info/entry_points.txt,sha256=OL85gYU1eD8cuPlikifFngXpeBjaxl6rIJ8KkC_3r-I,58
+jinja2-3.1.6.dist-info/licenses/LICENSE.txt,sha256=O0nc7kEF6ze6wQ-vG-JgQI_oXSUrjp3y4JefweCUQ3s,1475
+jinja2/__init__.py,sha256=xxepO9i7DHsqkQrgBEduLtfoz2QCuT6_gbL4XSN1hbU,1928
+jinja2/__pycache__/__init__.cpython-312.pyc,,
+jinja2/__pycache__/_identifier.cpython-312.pyc,,
+jinja2/__pycache__/async_utils.cpython-312.pyc,,
+jinja2/__pycache__/bccache.cpython-312.pyc,,
+jinja2/__pycache__/compiler.cpython-312.pyc,,
+jinja2/__pycache__/constants.cpython-312.pyc,,
+jinja2/__pycache__/debug.cpython-312.pyc,,
+jinja2/__pycache__/defaults.cpython-312.pyc,,
+jinja2/__pycache__/environment.cpython-312.pyc,,
+jinja2/__pycache__/exceptions.cpython-312.pyc,,
+jinja2/__pycache__/ext.cpython-312.pyc,,
+jinja2/__pycache__/filters.cpython-312.pyc,,
+jinja2/__pycache__/idtracking.cpython-312.pyc,,
+jinja2/__pycache__/lexer.cpython-312.pyc,,
+jinja2/__pycache__/loaders.cpython-312.pyc,,
+jinja2/__pycache__/meta.cpython-312.pyc,,
+jinja2/__pycache__/nativetypes.cpython-312.pyc,,
+jinja2/__pycache__/nodes.cpython-312.pyc,,
+jinja2/__pycache__/optimizer.cpython-312.pyc,,
+jinja2/__pycache__/parser.cpython-312.pyc,,
+jinja2/__pycache__/runtime.cpython-312.pyc,,
+jinja2/__pycache__/sandbox.cpython-312.pyc,,
+jinja2/__pycache__/tests.cpython-312.pyc,,
+jinja2/__pycache__/utils.cpython-312.pyc,,
+jinja2/__pycache__/visitor.cpython-312.pyc,,
+jinja2/_identifier.py,sha256=_zYctNKzRqlk_murTNlzrju1FFJL7Va_Ijqqd7ii2lU,1958
+jinja2/async_utils.py,sha256=vK-PdsuorOMnWSnEkT3iUJRIkTnYgO2T6MnGxDgHI5o,2834
+jinja2/bccache.py,sha256=gh0qs9rulnXo0PhX5jTJy2UHzI8wFnQ63o_vw7nhzRg,14061
+jinja2/compiler.py,sha256=9RpCQl5X88BHllJiPsHPh295Hh0uApvwFJNQuutULeM,74131
+jinja2/constants.py,sha256=GMoFydBF_kdpaRKPoM5cl5MviquVRLVyZtfp5-16jg0,1433
+jinja2/debug.py,sha256=CnHqCDHd-BVGvti_8ZsTolnXNhA3ECsY-6n_2pwU8Hw,6297
+jinja2/defaults.py,sha256=boBcSw78h-lp20YbaXSJsqkAI2uN_mD_TtCydpeq5wU,1267
+jinja2/environment.py,sha256=9nhrP7Ch-NbGX00wvyr4yy-uhNHq2OCc60ggGrni_fk,61513
+jinja2/exceptions.py,sha256=ioHeHrWwCWNaXX1inHmHVblvc4haO7AXsjCp3GfWvx0,5071
+jinja2/ext.py,sha256=5PF5eHfh8mXAIxXHHRB2xXbXohi8pE3nHSOxa66uS7E,31875
+jinja2/filters.py,sha256=PQ_Egd9n9jSgtnGQYyF4K5j2nYwhUIulhPnyimkdr-k,55212
+jinja2/idtracking.py,sha256=-ll5lIp73pML3ErUYiIJj7tdmWxcH_IlDv3yA_hiZYo,10555
+jinja2/lexer.py,sha256=LYiYio6br-Tep9nPcupWXsPEtjluw3p1mU-lNBVRUfk,29786
+jinja2/loaders.py,sha256=wIrnxjvcbqh5VwW28NSkfotiDq8qNCxIOSFbGUiSLB4,24055
+jinja2/meta.py,sha256=OTDPkaFvU2Hgvx-6akz7154F8BIWaRmvJcBFvwopHww,4397
+jinja2/nativetypes.py,sha256=7GIGALVJgdyL80oZJdQUaUfwSt5q2lSSZbXt0dNf_M4,4210
+jinja2/nodes.py,sha256=m1Duzcr6qhZI8JQ6VyJgUNinjAf5bQzijSmDnMsvUx8,34579
+jinja2/optimizer.py,sha256=rJnCRlQ7pZsEEmMhsQDgC_pKyDHxP5TPS6zVPGsgcu8,1651
+jinja2/parser.py,sha256=lLOFy3sEmHc5IaEHRiH1sQVnId2moUQzhyeJZTtdY30,40383
+jinja2/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+jinja2/runtime.py,sha256=gDk-GvdriJXqgsGbHgrcKTP0Yp6zPXzhzrIpCFH3jAU,34249
+jinja2/sandbox.py,sha256=Mw2aitlY2I8la7FYhcX2YG9BtUYcLnD0Gh3d29cDWrY,15009
+jinja2/tests.py,sha256=VLsBhVFnWg-PxSBz1MhRnNWgP1ovXk3neO1FLQMeC9Q,5926
+jinja2/utils.py,sha256=rRp3o9e7ZKS4fyrWRbELyLcpuGVTFcnooaOa1qx_FIk,24129
+jinja2/visitor.py,sha256=EcnL1PIwf_4RVCOMxsRNuR8AXHbS1qfAdMOE2ngKJz4,3557
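Note: each RECORD row is `path,sha256=<hash>,<size>`, where the hash is the file's SHA-256 digest encoded as unpadded URL-safe base64 (the wheel/PEP 376 convention). A sketch reproducing the INSTALLER entry above from its known 4-byte content `pip\n`:

```python
import base64
import hashlib

def record_hash(data: bytes) -> str:
    # SHA-256, URL-safe base64, trailing '=' padding stripped.
    digest = hashlib.sha256(data).digest()
    return base64.urlsafe_b64encode(digest).rstrip(b"=").decode("ascii")

# The INSTALLER file is recorded above as 4 bytes with exactly this hash:
assert record_hash(b"pip\n") == "zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg"
```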
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/jinja2-3.1.6.dist-info/WHEEL
ADDED
@@ -0,0 +1,4 @@
+Wheel-Version: 1.0
+Generator: flit 3.11.0
+Root-Is-Purelib: true
+Tag: py3-none-any
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/jinja2-3.1.6.dist-info/entry_points.txt
ADDED
@@ -0,0 +1,3 @@
+[babel.extractors]
+jinja2=jinja2.ext:babel_extract[i18n]
+
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml-6.0.2.dist-info/INSTALLER
ADDED
@@ -0,0 +1 @@
+pip
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml-6.0.2.dist-info/METADATA
ADDED
@@ -0,0 +1,103 @@
+Metadata-Version: 2.4
+Name: lxml
+Version: 6.0.2
+Summary: Powerful and Pythonic XML processing library combining libxml2/libxslt with the ElementTree API.
+Home-page: https://lxml.de/
+Author: lxml dev team
+Author-email: lxml@lxml.de
+Maintainer: lxml dev team
+Maintainer-email: lxml@lxml.de
+License: BSD-3-Clause
+Project-URL: Source, https://github.com/lxml/lxml
+Project-URL: Bug Tracker, https://bugs.launchpad.net/lxml
+Classifier: Development Status :: 5 - Production/Stable
+Classifier: Intended Audience :: Developers
+Classifier: Intended Audience :: Information Technology
+Classifier: Programming Language :: Cython
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.8
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3.13
+Classifier: Programming Language :: Python :: 3.14
+Classifier: Programming Language :: C
+Classifier: Operating System :: OS Independent
+Classifier: Topic :: Text Processing :: Markup :: HTML
+Classifier: Topic :: Text Processing :: Markup :: XML
+Classifier: Topic :: Software Development :: Libraries :: Python Modules
+Requires-Python: >=3.8
+License-File: LICENSE.txt
+License-File: LICENSES.txt
+Provides-Extra: source
+Provides-Extra: cssselect
+Requires-Dist: cssselect>=0.7; extra == "cssselect"
+Provides-Extra: html5
+Requires-Dist: html5lib; extra == "html5"
+Provides-Extra: htmlsoup
+Requires-Dist: BeautifulSoup4; extra == "htmlsoup"
+Provides-Extra: html-clean
+Requires-Dist: lxml_html_clean; extra == "html-clean"
+Dynamic: author
+Dynamic: author-email
+Dynamic: classifier
+Dynamic: description
+Dynamic: home-page
+Dynamic: license
+Dynamic: license-file
+Dynamic: maintainer
+Dynamic: maintainer-email
+Dynamic: project-url
+Dynamic: provides-extra
+Dynamic: requires-python
+Dynamic: summary
+
+lxml is a Pythonic, mature binding for the libxml2 and libxslt libraries.
+It provides safe and convenient access to these libraries using the
+ElementTree API.
+
+It extends the ElementTree API significantly to offer support for XPath,
+RelaxNG, XML Schema, XSLT, C14N and much more.
+
+To contact the project, go to the `project home page <https://lxml.de/>`_
+or see our bug tracker at https://launchpad.net/lxml
+
+In case you want to use the current in-development version of lxml,
+you can get it from the github repository at
+https://github.com/lxml/lxml . Note that this requires Cython to
+build the sources, see the build instructions on the project home page.
+
+
+After an official release of a new stable series, bug fixes may become available at
+https://github.com/lxml/lxml/tree/lxml-6.0 .
+Running ``pip install https://github.com/lxml/lxml/archive/refs/heads/lxml-6.0.tar.gz``
+will install the unreleased branch state as soon as a maintenance branch has been established.
+Note that this requires Cython to be installed at an appropriate version for the build.
+
+6.0.2 (2025-09-21)
+==================
+
+Bugs fixed
+----------
+
+* LP#2125278: Compilation with libxml2 2.15.0 failed.
+  Original patch by Xi Ruoyao.
+
+* Setting ``decompress=True`` in the parser had no effect in libxml2 2.15.
+
+* Binary wheels on Linux and macOS use the library version libxml2 2.14.6.
+  See https://gitlab.gnome.org/GNOME/libxml2/-/releases/v2.14.6
+
+* Test failures in libxml2 2.15.0 were fixed.
+
+Other changes
+-------------
+
+* Binary wheels for Py3.9-3.11 on the ``riscv64`` architecture were added.
+
+* Error constants were updated to match libxml2 2.15.0.
+
+* Built using Cython 3.1.4.
+
+
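Note: the description above highlights the ElementTree API plus extended XPath support. A minimal sketch of both on a parsed document:

```python
from lxml import etree

root = etree.fromstring(
    "<doc><item id='1'>red</item><item id='2'>blue</item></doc>"
)

# ElementTree-style access...
print(root[0].get("id"), root[0].text)        # -> 1 red

# ...and the XPath support the description mentions.
print(root.xpath("//item[@id='2']/text()"))   # -> ['blue']
```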
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml-6.0.2.dist-info/RECORD
ADDED
@@ -0,0 +1,204 @@
| 1 |
+
lxml-6.0.2.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
|
| 2 |
+
lxml-6.0.2.dist-info/METADATA,sha256=0qIHkwlNTTMz4-c5e8ZnbbGgt_vpYZHCEoqXyckR95Q,3622
|
| 3 |
+
lxml-6.0.2.dist-info/RECORD,,
|
| 4 |
+
lxml-6.0.2.dist-info/WHEEL,sha256=1rk9WkINO5IYd_dGyocTHV6htge3I27wu_Vax8WCadA,152
|
| 5 |
+
lxml-6.0.2.dist-info/licenses/LICENSE.txt,sha256=j8K1aBM1FuRoRdIUeRet7uFkjnCumrXtbFQXr-9M6FU,1507
|
| 6 |
+
lxml-6.0.2.dist-info/licenses/LICENSES.txt,sha256=QdSd1AaqDhVIptXyGjDWv2OLPNlutyid00jYPtLkA5I,1514
|
| 7 |
+
lxml-6.0.2.dist-info/top_level.txt,sha256=NjD988wqaKq512nshNdLt-uDxsjkp4Bh51m6N-dhUrk,5
|
| 8 |
+
lxml/ElementInclude.py,sha256=PSLeZFvCa76WHJulPLxcZXJtCI2-4dK2CtqPRiYOAQg,8560
|
| 9 |
+
lxml/__init__.py,sha256=rgOcPyZUNBFL30ylxIxd8fHHWi6TwyIUCi8Av84XWwo,574
|
| 10 |
+
lxml/__pycache__/ElementInclude.cpython-312.pyc,,
|
| 11 |
+
lxml/__pycache__/__init__.cpython-312.pyc,,
|
| 12 |
+
lxml/__pycache__/_elementpath.cpython-312.pyc,,
|
| 13 |
+
lxml/__pycache__/builder.cpython-312.pyc,,
|
| 14 |
+
lxml/__pycache__/cssselect.cpython-312.pyc,,
|
| 15 |
+
lxml/__pycache__/doctestcompare.cpython-312.pyc,,
|
| 16 |
+
lxml/__pycache__/pyclasslookup.cpython-312.pyc,,
|
| 17 |
+
lxml/__pycache__/sax.cpython-312.pyc,,
|
| 18 |
+
lxml/__pycache__/usedoctest.cpython-312.pyc,,
|
| 19 |
+
lxml/_elementpath.cpython-312-x86_64-linux-gnu.so,sha256=1mB7tnIOx_08TqlYHQQSYJX5SXE4lQZrrnexJZBuvi8,217352
|
| 20 |
+
lxml/_elementpath.py,sha256=b80hM3ndAkTtRX6v54za3LkkAqCcd0700BbMPZHnTBU,10959
|
| 21 |
+
lxml/apihelpers.pxi,sha256=9S6bzp-VKCUPZv0f6-el5PsbPFN4FJqSnMCIYilS0eU,63881
|
| 22 |
+
lxml/builder.cpython-312-x86_64-linux-gnu.so,sha256=iSov_1syOR8dCLyAPsAlfGOkc67Yl1GX7I93Af993ZI,129080
|
| 23 |
+
lxml/builder.py,sha256=KI1HxHTd4wJqqVfmTRtSbXBQdl2T-P36ih4hT-J3MNw,8485
|
| 24 |
+
lxml/classlookup.pxi,sha256=Tax8Vhbm5C6UCjgmRFsYjW0pFHxIuTthH1MOgASDLgc,22435
|
| 25 |
+
lxml/cleanup.pxi,sha256=ZNEpbv7qx_ICPzsxhCaMUHCOfiznOoZ_u3jlYXHAuh4,8454
|
| 26 |
+
lxml/cssselect.py,sha256=_wZdX-B9p5MeIYABmENIYRWEkwXwX-7jO8Dkf-1rUZU,3306
|
| 27 |
+
lxml/debug.pxi,sha256=KTcpR8-slUYvmIPbE35GoHDNTb-gjTEvD7bw6LltM4c,1125
|
| 28 |
+
lxml/docloader.pxi,sha256=bYSZAxxbBEfVzfLXTUWFRfOyUTfV23L7i9hR2dgtSNY,5772
|
| 29 |
+
lxml/doctestcompare.py,sha256=40EDnkwpcvW86qNa86990OXF42xdHaosSZoiBsEjkzU,17731
|
| 30 |
+
lxml/dtd.pxi,sha256=IAKkmA4ZoC68sqAWcTqoS8jEGYcPQrVMCZgn4iLBYko,15281
|
| 31 |
+
lxml/etree.cpython-312-x86_64-linux-gnu.so,sha256=4SybuGGBSJ2dF8AZo5PSuo8BaiLbT3eF8sofIH2RT_U,5395056
|
| 32 |
+
lxml/etree.h,sha256=_NkGkD3C_jpE4UegvQ6Y32_ycTbUCLyOBz9xfWRPkug,9792
|
| 33 |
+
lxml/etree.pyx,sha256=2qCb8ZNjsdoB0fUELYwAM4ldLQZWS5_gt-OxKEUM-vs,138014
|
| 34 |
+
lxml/etree_api.h,sha256=dNCm28ubaVS8SbhLuxs9JvYWg41NoR_yD3qTRr7hliA,17372
|
| 35 |
+
lxml/extensions.pxi,sha256=xKLad35EQgpsDhs07tw31aKJBBMWIK9rMc0JTXETAUA,32022
|
| 36 |
+
lxml/html/ElementSoup.py,sha256=s_dLobLMuKn2DhexR-iDXdZrMFg1RjLy1feHsIeZMpw,320
|
| 37 |
+
lxml/html/__init__.py,sha256=CC5WdsvSptZhr9MZya1qsL6JKVbviYdrHOhXrGhmORg,64425
|
| 38 |
+
lxml/html/__pycache__/ElementSoup.cpython-312.pyc,,
|
| 39 |
+
lxml/html/__pycache__/__init__.cpython-312.pyc,,
|
| 40 |
+
lxml/html/__pycache__/_diffcommand.cpython-312.pyc,,
|
| 41 |
+
lxml/html/__pycache__/_difflib.cpython-312.pyc,,
|
| 42 |
+
lxml/html/__pycache__/_html5builder.cpython-312.pyc,,
|
| 43 |
+
lxml/html/__pycache__/_setmixin.cpython-312.pyc,,
|
| 44 |
+
lxml/html/__pycache__/builder.cpython-312.pyc,,
|
| 45 |
+
lxml/html/__pycache__/clean.cpython-312.pyc,,
|
| 46 |
+
lxml/html/__pycache__/defs.cpython-312.pyc,,
|
| 47 |
+
lxml/html/__pycache__/diff.cpython-312.pyc,,
|
| 48 |
+
lxml/html/__pycache__/formfill.cpython-312.pyc,,
|
| 49 |
+
lxml/html/__pycache__/html5parser.cpython-312.pyc,,
|
| 50 |
+
lxml/html/__pycache__/soupparser.cpython-312.pyc,,
|
| 51 |
+
lxml/html/__pycache__/usedoctest.cpython-312.pyc,,
|
| 52 |
+
lxml/html/_diffcommand.py,sha256=kz_7EP9PmYWuczlZcGiw74_rG0eTKvQ2lrO0rkiwlYE,2081
|
| 53 |
+
lxml/html/_difflib.cpython-312-x86_64-linux-gnu.so,sha256=XuPeciCf-4e7FpclT9B1viDjUaTJVJg4zkeEW_zXauo,570296
|
| 54 |
+
lxml/html/_difflib.py,sha256=GgH_jVrZQC8tI8WV_lFZQsXFJ3mOTAPup1zjBJNvkPo,84954
|
| 55 |
+
lxml/html/_html5builder.py,sha256=NLaT-Ev-aBgJpeQl-6ZbJChLZK5GV-znDkHOJD5VQC4,3230
|
| 56 |
+
lxml/html/_setmixin.py,sha256=8IFIOLmVz0G-XzsD2tCEkSFWO-dgPBHgvHufC8ni67s,1188
|
| 57 |
+
lxml/html/builder.py,sha256=Uz3r5uiuCdoN0UPa7ngoLMwAadVIhslzGvlRPGigY_M,6187
|
| 58 |
+
lxml/html/clean.py,sha256=FghSJy4jt2RaBy6dgusowkU18hxpZ4XLE5ceCK9qxyA,503
|
| 59 |
+
lxml/html/defs.py,sha256=l_6nh4DHvrsVyWVqWCUUx14QiahRyZv4Melqy_thf6Q,4250
|
| 60 |
+
lxml/html/diff.cpython-312-x86_64-linux-gnu.so,sha256=iWcPoTRaf2StqEyPKB6xz1j15rvZDLvW_a-KwYLJLyY,377848
|
| 61 |
+
lxml/html/diff.py,sha256=Za0By-yeYlQEjUu7m7xKB288kKiy8VBS5gT0RPOaFY0,32989
|
| 62 |
+
lxml/html/formfill.py,sha256=umgk0BbkAI1W6q9musFbL-cDnI_aap2NsLBJqk0UmVI,9681
|
| 63 |
+
lxml/html/html5parser.py,sha256=dnyC4cqHxywjZSzk0mu2L7THTZjxhg4yF4pncjusa_w,8634
|
| 64 |
+
lxml/html/soupparser.py,sha256=xo8VvNeOEb-SChuXLKCRECh8J7HBiJLE9sAbEskoUUQ,10197
|
| 65 |
+
lxml/html/usedoctest.py,sha256=tPlmVz4KK1GRKV5DJLrdVECeqsT9PlDzSqqTodVi5s0,249
|
| 66 |
+
lxml/includes/__init__.pxd,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
| 67 |
+
lxml/includes/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
| 68 |
+
lxml/includes/__pycache__/__init__.cpython-312.pyc,,
|
| 69 |
+
lxml/includes/c14n.pxd,sha256=DBQcOJ0c_YS245ohMb8fmuEC1kFyv1LrNY_8Mf-syZg,1110
|
| 70 |
+
lxml/includes/config.pxd,sha256=H6Mrl8It21hzRI2hzMId9W48QqkYYkoLT4dniLNmdTw,96
|
| 71 |
+
lxml/includes/dtdvalid.pxd,sha256=Nv0OykjYehv2lO-Zj--q6jS3TAC_dvQVPSgPMuse1NM,689
|
| 72 |
+
lxml/includes/etree_defs.h,sha256=h_UjJTmNUqPyKNNrWB9hxmt6v4CF7_83XVY8dOfxqW0,14524
|
| 73 |
+
lxml/includes/etreepublic.pxd,sha256=Bn4d3JkWPqXputXqI-eJ0xmPrwNFPTfDCa7axgjB7FM,10184
|
| 74 |
+
lxml/includes/extlibs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
| 75 |
+
lxml/includes/extlibs/__pycache__/__init__.cpython-312.pyc,,
|
| 76 |
+
lxml/includes/extlibs/libcharset.h,sha256=GA0FumrbNI4VDGlzq3lf5CLaCwXgn4unw2l0btGQFwI,1510
|
| 77 |
+
lxml/includes/extlibs/localcharset.h,sha256=Z_AagaQeq0aDE7NPsVOqEf4nO4KcUp46ggo4d0ONIOQ,6338
|
| 78 |
+
lxml/includes/extlibs/zconf.h,sha256=ROVD_0UUx6mgHWSAGcLJqB0RBcv6PHfx-vbNhur6ir0,16464
|
| 79 |
+
lxml/includes/extlibs/zlib.h,sha256=ilV5r3LqT0J_8ApBUPDMs_xcHkN59ybhARM7Grn8YAw,96829
|
| 80 |
+
lxml/includes/htmlparser.pxd,sha256=9uASkP5dU7OE2lCOLT-z2e01qSbFlp4ehgwdostF_qk,2802
|
| 81 |
+
lxml/includes/libexslt/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
| 82 |
+
lxml/includes/libexslt/__pycache__/__init__.cpython-312.pyc,,
|
| 83 |
+
lxml/includes/libexslt/exslt.h,sha256=eSW5tMJAewSUANLqk7AGEiU8b2BbCNRyauHnez7nKSU,3114
|
| 84 |
+
lxml/includes/libexslt/exsltconfig.h,sha256=QHxzEbRlv_h0USBvpr0Zrl0Muzlc71VCrvgR6lqnLEY,1172
|
| 85 |
+
lxml/includes/libexslt/exsltexports.h,sha256=1Jm9KTXm2FUUJIZ6V6-Uw55yG0BMULX3_goyxDd2LL8,1077
|
| 86 |
+
lxml/includes/libxml/HTMLparser.h,sha256=sU4xGqj-vBtEvzlxA3hBPWJboifvkc4F1hynKXmsl3k,9569
|
| 87 |
+
lxml/includes/libxml/HTMLtree.h,sha256=Q7UBKFbQ8fx4d_dMnmR6ay8JmfOhopFkDp2B63YkLDU,3517
|
| 88 |
+
lxml/includes/libxml/SAX.h,sha256=SFnG27EFrYGUB9HDL_xSIGBwEns5pl07rApXWThFZFM,386
|
| 89 |
+
lxml/includes/libxml/SAX2.h,sha256=RfFP5o3le-Rg8bnA2GW7L7L9_pfXCs3TieODcv1DTWY,4240
|
| 90 |
+
lxml/includes/libxml/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
| 91 |
+
lxml/includes/libxml/__pycache__/__init__.cpython-312.pyc,,
|
| 92 |
+
lxml/includes/libxml/c14n.h,sha256=BSBXw6nIZutC8mWvbRrLLmoWjw3wRt-nM93vjXGMCm8,2742
|
| 93 |
+
lxml/includes/libxml/catalog.h,sha256=H9ssTCaBjtDqc-AZqCk1R7h8F2iD9szqLjJyHpaczXg,4633
|
| 94 |
+
lxml/includes/libxml/chvalid.h,sha256=TZcceNp6Cw0QlYwIqK9GxyYqL5UiAjpQyjt_yrZGTQE,5087
|
| 95 |
+
lxml/includes/libxml/debugXML.h,sha256=XXRNI39gJW7bGRC4SzE4ad-SJ906BsUGz3AwOtkKuS4,1667
|
| 96 |
+
lxml/includes/libxml/dict.h,sha256=SweaPGMtTTf4je6dNTIoEzcfEvpsAT9_PhR7FC0K-rQ,1770
|
| 97 |
+
lxml/includes/libxml/encoding.h,sha256=haL7ratww2wkIERGmtwUqU2BbTVe52FZFU7MmrOpsPk,9623
|
| 98 |
+
lxml/includes/libxml/entities.h,sha256=LEOCA826-0f8dhRJzC_2hvUVsSH7lKQjrea9hSTdBbo,4419
|
| 99 |
+
lxml/includes/libxml/globals.h,sha256=NH8zyRI5cXJJGp5k2aLxOm-reJEGOFX6LYP82GBXRlY,583
|
| 100 |
+
lxml/includes/libxml/hash.h,sha256=KIIpAYKBfGUU3ydWhGehUyfuauZz_Ps0gyambzQo_rc,7017
|
| 101 |
+
lxml/includes/libxml/list.h,sha256=oh7iJNQajRA_cHsNk9CcFPYkaW2smf4J_MpedPPjC4k,3128
|
| 102 |
+
lxml/includes/libxml/nanoftp.h,sha256=22PBtWhJueYLFvwukt4oFooRct_xJA83hbluHRBNXUM,302
|
| 103 |
+
lxml/includes/libxml/nanohttp.h,sha256=bLbzYjAyAKmP3ComMOPH6XaUImu6bNAESF1HrVtRve0,2124
|
| 104 |
+
lxml/includes/libxml/parser.h,sha256=Uq7-ce55UUAsvo4n6CiBlNQpmowewvWhOsQtgGM1UQ8,48498
|
| 105 |
+
lxml/includes/libxml/parserInternals.h,sha256=8_Wr6UgRzm8BRn1RPLxyBkw6BagAdDvVqMA_e181_EI,14539
|
| 106 |
+
lxml/includes/libxml/relaxng.h,sha256=VXZ74r5Yja06KqypdBHc8neDwPxQ2aMrsWHSdRt5oi4,5991
|
| 107 |
+
lxml/includes/libxml/schemasInternals.h,sha256=V8M4In3zf24EX55Yt4dcfxwp7NpHGYViKnLKwtyrPJ4,26233
|
| 108 |
+
lxml/includes/libxml/schematron.h,sha256=8EhPDhvtlMxl9e0C5rSbEruOvzJS5BC_OOFbq9RXZnY,4255
|
| 109 |
+
lxml/includes/libxml/threads.h,sha256=mT3CgK4lXK7-NDnUOFXqYuCK6fyY70S3BsHF-TnT45k,1619
|
| 110 |
+
lxml/includes/libxml/tree.h,sha256=zTRLt6h5x6ApyeXgs90CKQZSAl2hKm7b5NxtPKUQFAE,36106
|
| 111 |
+
lxml/includes/libxml/uri.h,sha256=J9teJHme5z883c4twF5oImEYY-E3xSvhdSGpyRVtvIg,2855
|
| 112 |
+
lxml/includes/libxml/valid.h,sha256=By61IbPvk_eLux7a8x0mOaly7oclFaSGaFE8b2xZcUE,13226
|
| 113 |
+
lxml/includes/libxml/xinclude.h,sha256=K3I5jhw2zAMj26LuRNZc15Bwv2JE2hWxwVn4TCqv2b4,3258
|
| 114 |
+
lxml/includes/libxml/xlink.h,sha256=TVLOkISrcKDelo9n_XIUyPiStDYa8NxuF2dz70aBFCI,5062
|
| 115 |
+
lxml/includes/libxml/xmlIO.h,sha256=FvbuMYTy1-S5PScabE03wz0oWKf626pmXvOPZNuLm-w,11948
|
| 116 |
+
lxml/includes/libxml/xmlautomata.h,sha256=7Sc3YgPz1ZIBKCHPSxs5oAwJEZWQ1RT2kyUw85pUtmU,4004
|
| 117 |
+
lxml/includes/libxml/xmlerror.h,sha256=mMfltMxUza6kiSBfP2QfnY3UlMP_rEXKfX0wruBLl4A,37561
|
| 118 |
+
lxml/includes/libxml/xmlexports.h,sha256=IyV3AMeQVbOl0wkjlnNX4B8WUZ-5GNKQmxZc6-maWUU,2025
|
| 119 |
+
lxml/includes/libxml/xmlmemory.h,sha256=m7wGvVMxNzZiuOAo3vkjxaVWstc8aQLzb6obbjPsebE,4658
|
| 120 |
+
lxml/includes/libxml/xmlmodule.h,sha256=ERUHUmDdZRmh6NjLYWUpse51rLWR8qNjPHOtdgmlLF0,1198
|
| 121 |
+
lxml/includes/libxml/xmlreader.h,sha256=BAHinlSOTXX3DEax9BniaIIPAXJyLGfzym9R-27LCcU,12387
|
| 122 |
+
lxml/includes/libxml/xmlregexp.h,sha256=_q6C1XRy8DS3kSmLbEKpvkKQciTgjTJgGc_zUQ6m22M,2632
|
| 123 |
+
lxml/includes/libxml/xmlsave.h,sha256=zcEQr9sO5CsFrnoOLshhdsqMEr8k4AeFhbkYyNfO9Fs,2934
|
| 124 |
+
lxml/includes/libxml/xmlschemas.h,sha256=5AfLnYUcfmxHRzg0dVpdHig--4ui1-XDwDgpKGDKCiU,7067
|
| 125 |
+
lxml/includes/libxml/xmlschemastypes.h,sha256=MYwlGmoKAo3lHRaaKgnCXiLmPT9KRjdxyCJ7TEyZ6jM,4583
|
| 126 |
+
lxml/includes/libxml/xmlstring.h,sha256=d5PpqxP1I1sfmCUHvVJtjoC9h7hLHcAAQ5ok_Rtf50I,5271
|
| 127 |
+
lxml/includes/libxml/xmlunicode.h,sha256=8sq3wEW2AiyTCuc3ZceOEkce7lfrI7VnkRfwEQgc6pU,278
|
| 128 |
+
lxml/includes/libxml/xmlversion.h,sha256=oVpaE_xbttaeZNFKSuSfcLOceWz7LQgKP71Z1msXZNo,5112
|
| 129 |
+
lxml/includes/libxml/xmlwriter.h,sha256=BEUwYNKx3xymDE9vepksEK7yVq9SXYm1d2pQnzlPy90,20688
|
| 130 |
+
lxml/includes/libxml/xpath.h,sha256=CQv6X_pRhuXoCVpqoDXYB7FfusLK7AuPxCNigwhNYAA,16156
|
| 131 |
+
lxml/includes/libxml/xpathInternals.h,sha256=mc9B5tdpfssyz_NPUzww6dKuWCtBybBiBRJkTe4AE4U,18504
|
| 132 |
+
lxml/includes/libxml/xpointer.h,sha256=DAxMsfPp2SSZgXFrPbxBA84RwTMRf35Qg_LBbUzPQhA,1026
|
| 133 |
+
lxml/includes/libxslt/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
| 134 |
+
lxml/includes/libxslt/__pycache__/__init__.cpython-312.pyc,,
|
| 135 |
+
lxml/includes/libxslt/attributes.h,sha256=qKwzfGf7r89esLC65s96iYJWRA-s-Ezss2_V6Mmo1hk,957
|
| 136 |
+
lxml/includes/libxslt/documents.h,sha256=kBihgH5pqRvFalhm_fOFHtJTFhTpBcm681yT5dxgwfw,2704
|
| 137 |
+
lxml/includes/libxslt/extensions.h,sha256=W5UMyJqUP_1zt6sXZ0mgc0gAIwDJrZ8gjByhyrWqvd8,6899
|
| 138 |
+
lxml/includes/libxslt/extra.h,sha256=6X3Wu3NdPtrlqz-Koo7dB-rccnnszi6j3zg599gTByg,1640
|
| 139 |
+
lxml/includes/libxslt/functions.h,sha256=fc4CZj-9KeBHzO9-WWU_bNqmaEZAz3n7NNwClIBXk14,1972
|
| 140 |
+
lxml/includes/libxslt/imports.h,sha256=18kIjoGqdFXR63Ce3ZtzxsTiYV3XGKpchYakMUPDuUI,1840
|
| 141 |
+
lxml/includes/libxslt/keys.h,sha256=16v25VEluS7jYhgg6gYFwVxgGMn-1ctnlhhWWT4RcBY,1155
|
| 142 |
+
lxml/includes/libxslt/namespaces.h,sha256=VofSn2Kkn-a5JyRKCmY3jPp7amQy3n09vzy0KUQt4q0,1666
|
| 143 |
+
lxml/includes/libxslt/numbersInternals.h,sha256=Eg5gYZ5p3h0_e5wyI61S-0E6_ArVJzv0yr63j6BU2fc,2019
|
| 144 |
+
lxml/includes/libxslt/pattern.h,sha256=tJ-BPfs9UYgiZMMoQZbhij3g7xVppYq7TrrOu25eR7Q,2110
|
| 145 |
+
lxml/includes/libxslt/preproc.h,sha256=D_LjEdHhsdyBnEAvflnwFgoR4hGUb72kgEhXkkmPRsw,896
|
| 146 |
+
+lxml/includes/libxslt/security.h,sha256=fUD1cy_WxFCTvTNAF0WOQIU4p5CNWn1LHFyZJd-Fx5U,2652
+lxml/includes/libxslt/templates.h,sha256=bnt6Jqui6KU5pNUdMNPbQZkZ5d-VTWqC0TMGkOlVoIo,2268
+lxml/includes/libxslt/transform.h,sha256=ICT7meUV0OTAx27WaKVrKj-aUmR9LSpTNaOAJd2UStg,6311
+lxml/includes/libxslt/variables.h,sha256=cQAgPe4QCcK2uKbWg7Iz-9peM9xWGm7m3M6jQm0sjIA,3143
+lxml/includes/libxslt/xslt.h,sha256=wmFx2Q31Pd8Iq2phAQpY9J3QQatb8lWg3gABtqKFgEw,1964
+lxml/includes/libxslt/xsltInternals.h,sha256=2EbEKYmnYZq0HjGnUMAlpqnqZJurRXzjlgk5Js1WYaY,57949
+lxml/includes/libxslt/xsltconfig.h,sha256=cV5scdRK6xmOHeOg3OCw6hBfcQ_nrtNs_tKefX67304,2910
+lxml/includes/libxslt/xsltexports.h,sha256=1-luH-0bCIgBAlKAXhV-dqHBfwOAQNDamiYbxIlTf0k,1124
+lxml/includes/libxslt/xsltlocale.h,sha256=ppxGEmJfZIJgwRQzCM0_77p9WNekEWq1NrdYZrQl4IE,942
+lxml/includes/libxslt/xsltutils.h,sha256=1eguYgR9-jeNOVlBUktHboaq-VLX6JXraO80TfbARKM,9085
+lxml/includes/lxml-version.h,sha256=KZfk_lJnXSnxkyRdUV5taHsWJe4xbC6UEYfYldlfouI,71
+lxml/includes/relaxng.pxd,sha256=HzHlQ6mCcf_tj_JZ9NAVJTVAv8ScCkE8Ifq15y3bS0c,2615
+lxml/includes/schematron.pxd,sha256=Hob7xh-K-MKqp7WiG8thMagf5EkQzmgfi4ds0EF91JA,1604
+lxml/includes/tree.pxd,sha256=XApzMRy_LSqCtQ-OTS-vNSW7CT_OWstybfIT2H84LsA,20179
+lxml/includes/uri.pxd,sha256=3vOXw6AbSPxAM9uo71T1qnfx-wd9ezXLDQtWsb2zX0I,145
+lxml/includes/xinclude.pxd,sha256=CuO_XZNB6E2JK1qXXWn11APrjFQV5kA6SMyb77WZn0A,804
+lxml/includes/xmlerror.pxd,sha256=OQqayytkV0NigAPbsQCCcvmy7luRe0XhVzpTdzJjP3g,58837
+lxml/includes/xmlparser.pxd,sha256=eDGyU5kZyNVksK0dUhMIi7rnE-LSevXsqyl72v99Ess,13730
+lxml/includes/xmlschema.pxd,sha256=OLZPd2WDJyopiXJJyo-dAyyYHaeSYFiMAI4tqIiv-Ik,1702
+lxml/includes/xpath.pxd,sha256=e8-ZYUbRG7N1mHETAlknJ_QqAteOosrYLRgpH-OsTkg,5603
+lxml/includes/xslt.pxd,sha256=4yl3pOu7pAvsx5Tc-W4IWCoB8wgtSSR62HI1jqu6jko,8241
+lxml/isoschematron/__init__.py,sha256=uauerYeKTlWFCJSqieIHhF5l6rYV2myeEJ0Imd1LzRc,13274
+lxml/isoschematron/__pycache__/__init__.cpython-312.pyc,,
+lxml/isoschematron/resources/rng/iso-schematron.rng,sha256=VsWxPyi3iViJDDbjJJw0wWkEHkLrz9zoCA8zJLor9N4,18337
+lxml/isoschematron/resources/xsl/RNG2Schtrn.xsl,sha256=ObebsB8Wt-d3uIA_U5NU85TpnQ3PxPX38TdOAqosMac,3172
+lxml/isoschematron/resources/xsl/XSD2Schtrn.xsl,sha256=QweRrIIM-zFcgg98GXA2CaWfIbgVE0XKEeYSfvv67A0,4563
+lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_abstract_expand.xsl,sha256=xSZ_Ekq_I-62ZpiE5AqYYHwFW_qh855zt9V4_s7rbkY,11703
+lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_dsdl_include.xsl,sha256=x42QJ-dxQ1waPzydsCoQnp2Xj15y53nW43O7BuoDRHk,39957
+lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_schematron_message.xsl,sha256=Tr9BnO6pzjVWwhqJfm10UlvAy95EgfSCz2iMlrVGT6Q,2015
+lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_schematron_skeleton_for_xslt1.xsl,sha256=ue8q_88X4e_jsJizo31GRNBxNhdxkEE9fY20oq0Iqwk,71764
+lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_svrl_for_xslt1.xsl,sha256=BBAdsVSi5zAzeGepuN6gS1saQINDqITXKplmmj4dTWg,20382
+lxml/isoschematron/resources/xsl/iso-schematron-xslt1/readme.txt,sha256=OGLiFswuLJEW5EPYKOeoauuCJFEtVa6jyzBE1OcJI98,3310
+lxml/iterparse.pxi,sha256=JXvYhSOCaRjT_hYbRGMlJt2rlqx0TiRpN4FE1jQc63w,16521
+lxml/lxml.etree.h,sha256=_NkGkD3C_jpE4UegvQ6Y32_ycTbUCLyOBz9xfWRPkug,9792
+lxml/lxml.etree_api.h,sha256=dAbJPd53D_9CIGzePAUB3otgyhG4o2cSdA4-6apdzRA,17377
+lxml/nsclasses.pxi,sha256=5pzNBhBtlqObPdThL9QIGRs1Dxj1qnr0PyXuTCURqTg,9129
+lxml/objectify.cpython-312-x86_64-linux-gnu.so,sha256=TYF3CoGF-cenIwFh_1nY0sr2UI2wdsS8tZO2Wi0evyg,2933112
+lxml/objectify.pyx,sha256=I4bQQXmQssBtk5bTrid-eVURBLKRTM5iQZiviugIrts,75823
+lxml/objectpath.pxi,sha256=s5TNG2-EbaWWKLFAiX303B95zK_Ui8ausB__3QvFFGw,11450
+lxml/parser.pxi,sha256=VZfychEJ3-XPE3x6oGOEzn6HVAr74R7lXfDSVF-hq-U,85411
+lxml/parsertarget.pxi,sha256=v1PidxRaG5giwXcTDkpBI7PDFmsZuOcK0y9LdkQaY8M,6326
+lxml/proxy.pxi,sha256=8IVvYF2KTuzl7Hb3XGHEmcxfSLbUZkA2Q1Y50hLsyzE,23929
+lxml/public-api.pxi,sha256=XoP6_cJOEoQIItvE1RiYCKYD1ry4AobaOr4XLo0KSE4,6666
+lxml/pyclasslookup.py,sha256=gLD1HM2HtITYYiGzjEOewSwbB7XkVx_NZv_quCt79Oc,92
+lxml/readonlytree.pxi,sha256=ddRYczhHieJ4XUvWvTPW9N9oQ8vuKtv7lC1mtE1qvH8,18976
+lxml/relaxng.pxi,sha256=3OQ-fZMzP-KF5vM6HTozT_9ee3J0DJnpj9RcHC8LoMw,6339
+lxml/sax.cpython-312-x86_64-linux-gnu.so,sha256=UQn-l56AOOT5UUJ395Fil7It-Im_brnlsMYfmUpwQe0,190272
+lxml/sax.py,sha256=yrNvKD6rlon48jrR-1qpFXER8j4psYC2R5yt0u6TWLs,9706
+lxml/saxparser.pxi,sha256=TmkdM5h9xII9iKRaBk_1NGk2KTfeesl5Ha8bpFQGqLc,33529
+lxml/schematron.pxi,sha256=F2OHKZUl57-byBk_wWtPTnHZ1fwlj0FtwG3VuGtG-UY,6064
+lxml/serializer.pxi,sha256=iIXfechFHfvFs2sTk7wMIy3sDJxmaMPbNO33mkLLBUE,68063
+lxml/usedoctest.py,sha256=qRgZKQVcAZcl-zN0AIXVJnOsETUXz2nPXkxuzs1lGgk,230
+lxml/xinclude.pxi,sha256=7eBrI_OK47mmrHQ0ixbixRI8pKqQ1nwkMV-OmKUVlD4,2456
+lxml/xmlerror.pxi,sha256=i1kR42WB2BAxtrmh7m2ADlH-jffVQ-blW3pW0Ps4s-g,50061
+lxml/xmlid.pxi,sha256=5zf9oR6bsCtavGiOmilNyHqYwgG_bnrIabSd2SURtm0,6073
+lxml/xmlschema.pxi,sha256=mumNoHni5S3BQPtcmOHRd61KRaVWu4eOie2wQeB0e6E,8490
+lxml/xpath.pxi,sha256=aqW24V817dUxps4Gnc8h7Tm3QVlITKvxU5_9WgJUIFg,19132
+lxml/xslt.pxi,sha256=wxdbuvNFVA8eP57tHmBYWER__ceFhf6HGdsbBHbx_0A,36315
+lxml/xsltext.pxi,sha256=TImDiAPlAezC07P7RY1N9YChA7AuKFH-G53hXdel9yc,11088
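The RECORD entries above follow the wheel spec's `path,sha256=<digest>,<size>` format, where the digest is the urlsafe-base64 encoding of the file's SHA-256 with trailing `=` padding stripped; the `__pycache__` entry carries empty hash and size fields because `.pyc` files are generated at install time. A minimal sketch of recomputing one entry (the chosen path is illustrative):

```python
# Sketch: recompute one wheel RECORD entry ("path,sha256=<digest>,<size>").
# Per the wheel spec, the digest is the urlsafe-base64-encoded SHA-256 of
# the file contents with trailing "=" padding removed.
import base64
import hashlib
from pathlib import Path

def record_entry(path: str) -> str:
    data = Path(path).read_bytes()
    digest = base64.urlsafe_b64encode(hashlib.sha256(data).digest())
    return f"{path},sha256={digest.rstrip(b'=').decode()},{len(data)}"

# Illustrative: run from inside site-packages and compare the output
# against the corresponding RECORD line above.
print(record_entry("lxml/includes/uri.pxd"))
```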
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml-6.0.2.dist-info/WHEEL
ADDED
@@ -0,0 +1,6 @@
+Wheel-Version: 1.0
+Generator: setuptools (80.9.0)
+Root-Is-Purelib: false
+Tag: cp312-cp312-manylinux_2_26_x86_64
+Tag: cp312-cp312-manylinux_2_28_x86_64
+
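The two `Tag:` lines in this WHEEL file advertise the environments the wheel supports (CPython 3.12 interpreter and ABI on manylinux_2_26/2_28 x86-64); an installer accepts the wheel when any advertised tag is among the interpreter's supported tags. A minimal sketch of that check using the third-party `packaging` library, with the tag values copied from the WHEEL file above:

```python
# Sketch: decide whether this wheel is installable on the running
# interpreter, the way an installer matches WHEEL tags against the
# interpreter's supported tags. Requires the "packaging" library.
from packaging.tags import Tag, sys_tags

# The two tags declared in the WHEEL file above.
wheel_tags = {
    Tag("cp312", "cp312", "manylinux_2_26_x86_64"),
    Tag("cp312", "cp312", "manylinux_2_28_x86_64"),
}

# sys_tags() yields this interpreter's supported tags, most preferred first.
compatible = any(tag in wheel_tags for tag in sys_tags())
print("installable here:", compatible)
```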
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml-6.0.2.dist-info/top_level.txt
ADDED
@@ -0,0 +1 @@
+lxml
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia_curand_cu12-10.3.9.90.dist-info/INSTALLER
ADDED
@@ -0,0 +1 @@
+pip
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia_curand_cu12-10.3.9.90.dist-info/License.txt
ADDED
@@ -0,0 +1,1568 @@
+End User License Agreement
+--------------------------
+
+Preface
+-------
+
+The Software License Agreement in Chapter 1 and the Supplement
+in Chapter 2 contain license terms and conditions that govern
+the use of NVIDIA software. By accepting this agreement, you
+agree to comply with all the terms and conditions applicable
+to the product(s) included herein.
+
+NVIDIA Driver
+
+Description
+
+This package contains the operating system driver and
+fundamental system software components for NVIDIA GPUs.
+
+NVIDIA CUDA Toolkit
+
+Description
+
+The NVIDIA CUDA Toolkit provides command-line and graphical
+tools for building, debugging and optimizing the performance
+of applications accelerated by NVIDIA GPUs, runtime and math
+libraries, and documentation including programming guides,
+user manuals, and API references.
+
+Default Install Location of CUDA Toolkit
+
+Windows platform:
+
+%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v#.#
+
+Linux platform:
+
+/usr/local/cuda-#.#
+
+Mac platform:
+
+/Developer/NVIDIA/CUDA-#.#
+
+NVIDIA CUDA Samples
+
+Description
+
+This package includes over 100+ CUDA examples that demonstrate
+various CUDA programming principles, and efficient CUDA
+implementation of algorithms in specific application domains.
+
+Default Install Location of CUDA Samples
+
+Windows platform:
+
+%ProgramData%\NVIDIA Corporation\CUDA Samples\v#.#
+
+Linux platform:
+
+/usr/local/cuda-#.#/samples
+
+and
+
+$HOME/NVIDIA_CUDA-#.#_Samples
+
+Mac platform:
+
+/Developer/NVIDIA/CUDA-#.#/samples
+
+NVIDIA Nsight Visual Studio Edition (Windows only)
+
+Description
+
+NVIDIA Nsight Development Platform, Visual Studio Edition is a
+development environment integrated into Microsoft Visual
+Studio that provides tools for debugging, profiling, analyzing
+and optimizing your GPU computing and graphics applications.
+
+Default Install Location of Nsight Visual Studio Edition
+
+Windows platform:
+
+%ProgramFiles(x86)%\NVIDIA Corporation\Nsight Visual Studio Edition #.#
+
+1. License Agreement for NVIDIA Software Development Kits
+---------------------------------------------------------
+
+Release Date: July 26, 2018
+---------------------------
+
+Important Notice - Read before downloading, installing,
+copying or using the licensed software:
+-------------------------------------------------------
+
+This license agreement, including exhibits attached
+(“Agreement”) is a legal agreement between you and NVIDIA
+Corporation (“NVIDIA”) and governs your use of a NVIDIA
+software development kit (“SDK”).
+
+Each SDK has its own set of software and materials, but here
+is a description of the types of items that may be included in
+a SDK: source code, header files, APIs, data sets and assets
+(examples include images, textures, models, scenes, videos,
+native API input/output files), binary software, sample code,
+libraries, utility programs, programming code and
+documentation.
+
+This Agreement can be accepted only by an adult of legal age
+of majority in the country in which the SDK is used.
+
+If you are entering into this Agreement on behalf of a company
+or other legal entity, you represent that you have the legal
+authority to bind the entity to this Agreement, in which case
+“you” will mean the entity you represent.
+
+If you don’t have the required age or authority to accept
+this Agreement, or if you don’t accept all the terms and
+conditions of this Agreement, do not download, install or use
+the SDK.
+
+You agree to use the SDK only for purposes that are permitted
+by (a) this Agreement, and (b) any applicable law, regulation
+or generally accepted practices or guidelines in the relevant
+jurisdictions.
+
+1.1. License
+
+1.1.1. License Grant
+
+Subject to the terms of this Agreement, NVIDIA hereby grants
+you a non-exclusive, non-transferable license, without the
+right to sublicense (except as expressly provided in this
+Agreement) to:
+
+1. Install and use the SDK,
+
+2. Modify and create derivative works of sample source code
+   delivered in the SDK, and
+
+3. Distribute those portions of the SDK that are identified
+   in this Agreement as distributable, as incorporated in
+   object code format into a software application that meets
+   the distribution requirements indicated in this Agreement.
+
+1.1.2. Distribution Requirements
+
+These are the distribution requirements for you to exercise
+the distribution grant:
+
+1. Your application must have material additional
+   functionality, beyond the included portions of the SDK.
+
+2. The distributable portions of the SDK shall only be
+   accessed by your application.
+
+3. The following notice shall be included in modifications
+   and derivative works of sample source code distributed:
+   “This software contains source code provided by NVIDIA
+   Corporation.”
+
+4. Unless a developer tool is identified in this Agreement
+   as distributable, it is delivered for your internal use
+   only.
+
+5. The terms under which you distribute your application
+   must be consistent with the terms of this Agreement,
+   including (without limitation) terms relating to the
+   license grant and license restrictions and protection of
+   NVIDIA’s intellectual property rights. Additionally, you
+   agree that you will protect the privacy, security and
+   legal rights of your application users.
+
+6. You agree to notify NVIDIA in writing of any known or
+   suspected distribution or use of the SDK not in compliance
+   with the requirements of this Agreement, and to enforce
+   the terms of your agreements with respect to distributed
+   SDK.
+
+1.1.3. Authorized Users
+
+You may allow employees and contractors of your entity or of
+your subsidiary(ies) to access and use the SDK from your
+secure network to perform work on your behalf.
+
+If you are an academic institution you may allow users
+enrolled or employed by the academic institution to access and
+use the SDK from your secure network.
+
+You are responsible for the compliance with the terms of this
+Agreement by your authorized users. If you become aware that
+your authorized users didn’t follow the terms of this
+Agreement, you agree to take reasonable steps to resolve the
+non-compliance and prevent new occurrences.
+
+1.1.4. Pre-Release SDK
+
+The SDK versions identified as alpha, beta, preview or
+otherwise as pre-release, may not be fully functional, may
+contain errors or design flaws, and may have reduced or
+different security, privacy, accessibility, availability, and
+reliability standards relative to commercial versions of
+NVIDIA software and materials. Use of a pre-release SDK may
+result in unexpected results, loss of data, project delays or
+other unpredictable damage or loss.
+
+You may use a pre-release SDK at your own risk, understanding
+that pre-release SDKs are not intended for use in production
+or business-critical systems.
+
+NVIDIA may choose not to make available a commercial version
+of any pre-release SDK. NVIDIA may also choose to abandon
+development and terminate the availability of a pre-release
+SDK at any time without liability.
+
+1.1.5. Updates
+
+NVIDIA may, at its option, make available patches, workarounds
+or other updates to this SDK. Unless the updates are provided
+with their separate governing terms, they are deemed part of
+the SDK licensed to you as provided in this Agreement. You
+agree that the form and content of the SDK that NVIDIA
+provides may change without prior notice to you. While NVIDIA
+generally maintains compatibility between versions, NVIDIA may
+in some cases make changes that introduce incompatibilities in
+future versions of the SDK.
+
+1.1.6. Third Party Licenses
+
+The SDK may come bundled with, or otherwise include or be
+distributed with, third party software licensed by a NVIDIA
+supplier and/or open source software provided under an open
+source license. Use of third party software is subject to the
+third-party license terms, or in the absence of third party
+terms, the terms of this Agreement. Copyright to third party
+software is held by the copyright holders indicated in the
+third-party software or license.
+
+1.1.7. Reservation of Rights
+
+NVIDIA reserves all rights, title, and interest in and to the
+SDK, not expressly granted to you under this Agreement.
+
+1.2. Limitations
+
+The following license limitations apply to your use of the
+SDK:
+
+1. You may not reverse engineer, decompile or disassemble,
+   or remove copyright or other proprietary notices from any
+   portion of the SDK or copies of the SDK.
+
+2. Except as expressly provided in this Agreement, you may
+   not copy, sell, rent, sublicense, transfer, distribute,
+   modify, or create derivative works of any portion of the
+   SDK. For clarity, you may not distribute or sublicense the
+   SDK as a stand-alone product.
+
+3. Unless you have an agreement with NVIDIA for this
+   purpose, you may not indicate that an application created
+   with the SDK is sponsored or endorsed by NVIDIA.
+
+4. You may not bypass, disable, or circumvent any
+   encryption, security, digital rights management or
+   authentication mechanism in the SDK.
+
+5. You may not use the SDK in any manner that would cause it
+   to become subject to an open source software license. As
+   examples, licenses that require as a condition of use,
+   modification, and/or distribution that the SDK be:
+
+   a. Disclosed or distributed in source code form;
+
+   b. Licensed for the purpose of making derivative works;
+      or
+
+   c. Redistributable at no charge.
+
+6. Unless you have an agreement with NVIDIA for this
+   purpose, you may not use the SDK with any system or
+   application where the use or failure of the system or
+   application can reasonably be expected to threaten or
+   result in personal injury, death, or catastrophic loss.
+   Examples include use in avionics, navigation, military,
+   medical, life support or other life critical applications.
+   NVIDIA does not design, test or manufacture the SDK for
+   these critical uses and NVIDIA shall not be liable to you
+   or any third party, in whole or in part, for any claims or
+   damages arising from such uses.
+
+7. You agree to defend, indemnify and hold harmless NVIDIA
+   and its affiliates, and their respective employees,
+   contractors, agents, officers and directors, from and
+   against any and all claims, damages, obligations, losses,
+   liabilities, costs or debt, fines, restitutions and
+   expenses (including but not limited to attorney’s fees
+   and costs incident to establishing the right of
+   indemnification) arising out of or related to your use of
+   the SDK outside of the scope of this Agreement, or not in
+   compliance with its terms.
+
+1.3. Ownership
+
+1. NVIDIA or its licensors hold all rights, title and
+   interest in and to the SDK and its modifications and
+   derivative works, including their respective intellectual
+   property rights, subject to your rights described in this
+   section. This SDK may include software and materials from
+   NVIDIA’s licensors, and these licensors are intended
+   third party beneficiaries that may enforce this Agreement
+   with respect to their intellectual property rights.
+
+2. You hold all rights, title and interest in and to your
+   applications and your derivative works of the sample
+   source code delivered in the SDK, including their
+   respective intellectual property rights, subject to
+   NVIDIA’s rights described in this section.
+
+3. You may, but don’t have to, provide to NVIDIA
+   suggestions, feature requests or other feedback regarding
+   the SDK, including possible enhancements or modifications
+   to the SDK. For any feedback that you voluntarily provide,
+   you hereby grant NVIDIA and its affiliates a perpetual,
+   non-exclusive, worldwide, irrevocable license to use,
+   reproduce, modify, license, sublicense (through multiple
+   tiers of sublicensees), and distribute (through multiple
+   tiers of distributors) it without the payment of any
+   royalties or fees to you. NVIDIA will use feedback at its
+   choice. NVIDIA is constantly looking for ways to improve
+   its products, so you may send feedback to NVIDIA through
+   the developer portal at https://developer.nvidia.com.
+
+1.4. No Warranties
+
+THE SDK IS PROVIDED BY NVIDIA “AS IS” AND “WITH ALL
+FAULTS.” TO THE MAXIMUM EXTENT PERMITTED BY LAW, NVIDIA AND
+ITS AFFILIATES EXPRESSLY DISCLAIM ALL WARRANTIES OF ANY KIND
+OR NATURE, WHETHER EXPRESS, IMPLIED OR STATUTORY, INCLUDING,
+BUT NOT LIMITED TO, ANY WARRANTIES OF MERCHANTABILITY, FITNESS
+FOR A PARTICULAR PURPOSE, TITLE, NON-INFRINGEMENT, OR THE
+ABSENCE OF ANY DEFECTS THEREIN, WHETHER LATENT OR PATENT. NO
+WARRANTY IS MADE ON THE BASIS OF TRADE USAGE, COURSE OF
+DEALING OR COURSE OF TRADE.
+
+1.5. Limitation of Liability
+
+TO THE MAXIMUM EXTENT PERMITTED BY LAW, NVIDIA AND ITS
+AFFILIATES SHALL NOT BE LIABLE FOR ANY SPECIAL, INCIDENTAL,
+PUNITIVE OR CONSEQUENTIAL DAMAGES, OR ANY LOST PROFITS, LOSS
+OF USE, LOSS OF DATA OR LOSS OF GOODWILL, OR THE COSTS OF
+PROCURING SUBSTITUTE PRODUCTS, ARISING OUT OF OR IN CONNECTION
+WITH THIS AGREEMENT OR THE USE OR PERFORMANCE OF THE SDK,
+WHETHER SUCH LIABILITY ARISES FROM ANY CLAIM BASED UPON BREACH
+OF CONTRACT, BREACH OF WARRANTY, TORT (INCLUDING NEGLIGENCE),
+PRODUCT LIABILITY OR ANY OTHER CAUSE OF ACTION OR THEORY OF
+LIABILITY. IN NO EVENT WILL NVIDIA’S AND ITS AFFILIATES
+TOTAL CUMULATIVE LIABILITY UNDER OR ARISING OUT OF THIS
+AGREEMENT EXCEED US$10.00. THE NATURE OF THE LIABILITY OR THE
+NUMBER OF CLAIMS OR SUITS SHALL NOT ENLARGE OR EXTEND THIS
+LIMIT.
+
+These exclusions and limitations of liability shall apply
+regardless if NVIDIA or its affiliates have been advised of
+the possibility of such damages, and regardless of whether a
+remedy fails its essential purpose. These exclusions and
+limitations of liability form an essential basis of the
+bargain between the parties, and, absent any of these
+exclusions or limitations of liability, the provisions of this
+Agreement, including, without limitation, the economic terms,
+would be substantially different.
+
+1.6. Termination
+
+1. This Agreement will continue to apply until terminated by
+   either you or NVIDIA as described below.
+
+2. If you want to terminate this Agreement, you may do so by
+   stopping to use the SDK.
+
+3. NVIDIA may, at any time, terminate this Agreement if:
+
+   a. (i) you fail to comply with any term of this
+      Agreement and the non-compliance is not fixed within
+      thirty (30) days following notice from NVIDIA (or
+      immediately if you violate NVIDIA’s intellectual
+      property rights);
+
+   b. (ii) you commence or participate in any legal
+      proceeding against NVIDIA with respect to the SDK; or
+
+   c. (iii) NVIDIA decides to no longer provide the SDK in
+      a country or, in NVIDIA’s sole discretion, the
+      continued use of it is no longer commercially viable.
+
+4. Upon any termination of this Agreement, you agree to
+   promptly discontinue use of the SDK and destroy all copies
+   in your possession or control. Your prior distributions in
+   accordance with this Agreement are not affected by the
+   termination of this Agreement. Upon written request, you
+   will certify in writing that you have complied with your
+   commitments under this section. Upon any termination of
+   this Agreement all provisions survive except for the
+   license grant provisions.
+
+1.7. General
+
+If you wish to assign this Agreement or your rights and
+obligations, including by merger, consolidation, dissolution
+or operation of law, contact NVIDIA to ask for permission. Any
+attempted assignment not approved by NVIDIA in writing shall
+be void and of no effect. NVIDIA may assign, delegate or
+transfer this Agreement and its rights and obligations, and if
+to a non-affiliate you will be notified.
+
+You agree to cooperate with NVIDIA and provide reasonably
+requested information to verify your compliance with this
+Agreement.
+
+This Agreement will be governed in all respects by the laws of
+the United States and of the State of Delaware as those laws
+are applied to contracts entered into and performed entirely
+within Delaware by Delaware residents, without regard to the
+conflicts of laws principles. The United Nations Convention on
+Contracts for the International Sale of Goods is specifically
+disclaimed. You agree to all terms of this Agreement in the
+English language.
+
+The state or federal courts residing in Santa Clara County,
+California shall have exclusive jurisdiction over any dispute
+or claim arising out of this Agreement. Notwithstanding this,
+you agree that NVIDIA shall still be allowed to apply for
+injunctive remedies or an equivalent type of urgent legal
+relief in any jurisdiction.
+
+If any court of competent jurisdiction determines that any
+provision of this Agreement is illegal, invalid or
+unenforceable, such provision will be construed as limited to
+the extent necessary to be consistent with and fully
+enforceable under the law and the remaining provisions will
+remain in full force and effect. Unless otherwise specified,
+remedies are cumulative.
+
+Each party acknowledges and agrees that the other is an
+independent contractor in the performance of this Agreement.
+
+The SDK has been developed entirely at private expense and is
+“commercial items” consisting of “commercial computer
+software” and “commercial computer software
+documentation” provided with RESTRICTED RIGHTS. Use,
+duplication or disclosure by the U.S. Government or a U.S.
+Government subcontractor is subject to the restrictions in
+this Agreement pursuant to DFARS 227.7202-3(a) or as set forth
+in subparagraphs (c)(1) and (2) of the Commercial Computer
+Software - Restricted Rights clause at FAR 52.227-19, as
+applicable. Contractor/manufacturer is NVIDIA, 2788 San Tomas
+Expressway, Santa Clara, CA 95051.
+
+The SDK is subject to United States export laws and
+regulations. You agree that you will not ship, transfer or
+export the SDK into any country, or use the SDK in any manner,
+prohibited by the United States Bureau of Industry and
+Security or economic sanctions regulations administered by the
+U.S. Department of Treasury’s Office of Foreign Assets
+Control (OFAC), or any applicable export laws, restrictions or
+regulations. These laws include restrictions on destinations,
+end users and end use. By accepting this Agreement, you
+confirm that you are not a resident or citizen of any country
+currently embargoed by the U.S. and that you are not otherwise
+prohibited from receiving the SDK.
+
+Any notice delivered by NVIDIA to you under this Agreement
+will be delivered via mail, email or fax. You agree that any
+notices that NVIDIA sends you electronically will satisfy any
+legal communication requirements. Please direct your legal
+notices or other correspondence to NVIDIA Corporation, 2788
+San Tomas Expressway, Santa Clara, California 95051, United
+States of America, Attention: Legal Department.
+
+This Agreement and any exhibits incorporated into this
+Agreement constitute the entire agreement of the parties with
+respect to the subject matter of this Agreement and supersede
+all prior negotiations or documentation exchanged between the
+parties relating to this SDK license. Any additional and/or
+conflicting terms on documents issued by you are null, void,
+and invalid. Any amendment or waiver under this Agreement
+shall be in writing and signed by representatives of both
+parties.
+
+2. CUDA Toolkit Supplement to Software License Agreement for
+NVIDIA Software Development Kits
+------------------------------------------------------------
+
+Release date: August 16, 2018
+-----------------------------
+
+The terms in this supplement govern your use of the NVIDIA
+CUDA Toolkit SDK under the terms of your license agreement
+(“Agreement”) as modified by this supplement. Capitalized
+terms used but not defined below have the meaning assigned to
+them in the Agreement.
+
+This supplement is an exhibit to the Agreement and is
+incorporated as an integral part of the Agreement. In the
+event of conflict between the terms in this supplement and the
+terms in the Agreement, the terms in this supplement govern.
+
+2.1. License Scope
+
+The SDK is licensed for you to develop applications only for
+use in systems with NVIDIA GPUs.
+
+2.2. Distribution
+
+The portions of the SDK that are distributable under the
+Agreement are listed in Attachment A.
+
+2.3. Operating Systems
+
+Those portions of the SDK designed exclusively for use on the
+Linux or FreeBSD operating systems, or other operating systems
+derived from the source code to these operating systems, may
+be copied and redistributed for use in accordance with this
+Agreement, provided that the object code files are not
+modified in any way (except for unzipping of compressed
+files).
+
+2.4. Audio and Video Encoders and Decoders
+
+You acknowledge and agree that it is your sole responsibility
+to obtain any additional third-party licenses required to
+make, have made, use, have used, sell, import, and offer for
+sale your products or services that include or incorporate any
+third-party software and content relating to audio and/or
+video encoders and decoders from, including but not limited
+to, Microsoft, Thomson, Fraunhofer IIS, Sisvel S.p.A.,
+MPEG-LA, and Coding Technologies. NVIDIA does not grant to you
+under this Agreement any necessary patent or other rights with
+respect to any audio and/or video encoders and decoders.
+
+2.5. Licensing
+
+If the distribution terms in this Agreement are not suitable
+for your organization, or for any questions regarding this
+Agreement, please contact NVIDIA at
+nvidia-compute-license-questions@nvidia.com.
+
+2.6. Attachment A
+
+The following portions of the SDK are distributable under the
+Agreement:
+
+Component
+
+CUDA Runtime
+
+Windows
+
+cudart.dll, cudart_static.lib, cudadevrt.lib
+
+Mac OSX
+
+libcudart.dylib, libcudart_static.a, libcudadevrt.a
+
+Linux
+
+libcudart.so, libcudart_static.a, libcudadevrt.a
+
+Android
+
+libcudart.so, libcudart_static.a, libcudadevrt.a
+
+Component
+
+CUDA FFT Library
+
+Windows
+
+cufft.dll, cufftw.dll, cufft.lib, cufftw.lib
+
+Mac OSX
+
+libcufft.dylib, libcufft_static.a, libcufftw.dylib,
+libcufftw_static.a
+
+Linux
+
+libcufft.so, libcufft_static.a, libcufftw.so,
+libcufftw_static.a
+
+Android
+
+libcufft.so, libcufft_static.a, libcufftw.so,
+libcufftw_static.a
+
+Component
+
+CUDA BLAS Library
+
+Windows
+
+cublas.dll, cublasLt.dll
+
+Mac OSX
+
+libcublas.dylib, libcublasLt.dylib, libcublas_static.a,
+libcublasLt_static.a
+
+Linux
+
+libcublas.so, libcublasLt.so, libcublas_static.a,
+libcublasLt_static.a
+
+Android
+
+libcublas.so, libcublasLt.so, libcublas_static.a,
+libcublasLt_static.a
+
+Component
+
+NVIDIA "Drop-in" BLAS Library
+
+Windows
+
+nvblas.dll
+
+Mac OSX
+
+libnvblas.dylib
+
+Linux
+
+libnvblas.so
+
+Component
+
+CUDA Sparse Matrix Library
+
+Windows
+
+cusparse.dll, cusparse.lib
+
+Mac OSX
+
+libcusparse.dylib, libcusparse_static.a
+
+Linux
+
+libcusparse.so, libcusparse_static.a
+
+Android
+
+libcusparse.so, libcusparse_static.a
+
+Component
+
+CUDA Linear Solver Library
+
+Windows
+
+cusolver.dll, cusolver.lib
+
+Mac OSX
+
+libcusolver.dylib, libcusolver_static.a
+
+Linux
+
+libcusolver.so, libcusolver_static.a
+
+Android
+
+libcusolver.so, libcusolver_static.a
+
+Component
+
+CUDA Random Number Generation Library
+
+Windows
+
+curand.dll, curand.lib
+
+Mac OSX
+
+libcurand.dylib, libcurand_static.a
+
+Linux
+
+libcurand.so, libcurand_static.a
+
+Android
+
+libcurand.so, libcurand_static.a
+
+Component
+
+CUDA Accelerated Graph Library
+
+Component
+
+NVIDIA Performance Primitives Library
+
+Windows
+
+nppc.dll, nppc.lib, nppial.dll, nppial.lib, nppicc.dll,
+nppicc.lib, nppicom.dll, nppicom.lib, nppidei.dll,
+nppidei.lib, nppif.dll, nppif.lib, nppig.dll, nppig.lib,
+nppim.dll, nppim.lib, nppist.dll, nppist.lib, nppisu.dll,
+nppisu.lib, nppitc.dll, nppitc.lib, npps.dll, npps.lib
+
+Mac OSX
+
+libnppc.dylib, libnppc_static.a, libnppial.dylib,
+libnppial_static.a, libnppicc.dylib, libnppicc_static.a,
+libnppicom.dylib, libnppicom_static.a, libnppidei.dylib,
+libnppidei_static.a, libnppif.dylib, libnppif_static.a,
+libnppig.dylib, libnppig_static.a, libnppim.dylib,
+libnppisu_static.a, libnppitc.dylib, libnppitc_static.a,
+libnpps.dylib, libnpps_static.a
+
+Linux
+
+libnppc.so, libnppc_static.a, libnppial.so,
+libnppial_static.a, libnppicc.so, libnppicc_static.a,
+libnppicom.so, libnppicom_static.a, libnppidei.so,
+libnppidei_static.a, libnppif.so, libnppif_static.a
+libnppig.so, libnppig_static.a, libnppim.so,
+libnppim_static.a, libnppist.so, libnppist_static.a,
+libnppisu.so, libnppisu_static.a, libnppitc.so
+libnppitc_static.a, libnpps.so, libnpps_static.a
+
+Android
+
+libnppc.so, libnppc_static.a, libnppial.so,
+libnppial_static.a, libnppicc.so, libnppicc_static.a,
+libnppicom.so, libnppicom_static.a, libnppidei.so,
+libnppidei_static.a, libnppif.so, libnppif_static.a
+libnppig.so, libnppig_static.a, libnppim.so,
+libnppim_static.a, libnppist.so, libnppist_static.a,
+libnppisu.so, libnppisu_static.a, libnppitc.so
+libnppitc_static.a, libnpps.so, libnpps_static.a
+
+Component
+
+NVIDIA JPEG Library
+
+Linux
+
+libnvjpeg.so, libnvjpeg_static.a
+
+Component
+
+Internal common library required for statically linking to
+cuBLAS, cuSPARSE, cuFFT, cuRAND, nvJPEG and NPP
+
+Mac OSX
+
+libculibos.a
+
+Linux
+
+libculibos.a
+
+Component
+
+NVIDIA Runtime Compilation Library and Header
+
+All
+
+nvrtc.h
+
+Windows
+
+nvrtc.dll, nvrtc-builtins.dll
+
+Mac OSX
+
+libnvrtc.dylib, libnvrtc-builtins.dylib
+
+Linux
+
+libnvrtc.so, libnvrtc-builtins.so
+
+Component
+
+NVIDIA Optimizing Compiler Library
+
+Windows
+
+nvvm.dll
+
+Mac OSX
+
+libnvvm.dylib
+
+Linux
+
+libnvvm.so
+
+Component
+
+NVIDIA Common Device Math Functions Library
+
+Windows
+
+libdevice.10.bc
+
+Mac OSX
+
+libdevice.10.bc
+
+Linux
+
+libdevice.10.bc
+
+Component
+
+CUDA Occupancy Calculation Header Library
+
+All
+
+cuda_occupancy.h
+
+Component
+
+CUDA Half Precision Headers
+
+All
+
+cuda_fp16.h, cuda_fp16.hpp
+
+Component
+
+CUDA Profiling Tools Interface (CUPTI) Library
+
+Windows
+
+cupti.dll
+
+Mac OSX
+
+libcupti.dylib
+
+Linux
+
+libcupti.so
+
+Component
+
+NVIDIA Tools Extension Library
+
+Windows
+
+nvToolsExt.dll, nvToolsExt.lib
+
+Mac OSX
+
+libnvToolsExt.dylib
+
+Linux
+
+libnvToolsExt.so
+
+Component
+
+NVIDIA CUDA Driver Libraries
+
+Linux
+
+libcuda.so, libnvidia-fatbinaryloader.so,
+libnvidia-ptxjitcompiler.so
+
+The NVIDIA CUDA Driver Libraries are only distributable in
+applications that meet this criteria:
+
+1. The application was developed starting from a NVIDIA CUDA
+   container obtained from Docker Hub or the NVIDIA GPU
+   Cloud, and
+
+2. The resulting application is packaged as a Docker
+   container and distributed to users on Docker Hub or the
+   NVIDIA GPU Cloud only.
+
+2.7. Attachment B
+
+Additional Licensing Obligations
+
+The following third party components included in the SOFTWARE
+are licensed to Licensee pursuant to the following terms and
+conditions:
+
+1. Licensee's use of the GDB third party component is
+   subject to the terms and conditions of GNU GPL v3:
+
+   This product includes copyrighted third-party software licensed
+   under the terms of the GNU General Public License v3 ("GPL v3").
+   All third-party software packages are copyright by their respective
+   authors. GPL v3 terms and conditions are hereby incorporated into
+   the Agreement by this reference: http://www.gnu.org/licenses/gpl.txt
+
+   Consistent with these licensing requirements, the software
+   listed below is provided under the terms of the specified
+   open source software licenses. To obtain source code for
+   software provided under licenses that require
+   redistribution of source code, including the GNU General
+   Public License (GPL) and GNU Lesser General Public License
+   (LGPL), contact oss-requests@nvidia.com. This offer is
+   valid for a period of three (3) years from the date of the
+   distribution of this product by NVIDIA CORPORATION.
+
+   Component          License
+   CUDA-GDB           GPL v3
+
+2. Licensee represents and warrants that any and all third
+   party licensing and/or royalty payment obligations in
+   connection with Licensee's use of the H.264 video codecs
+   are solely the responsibility of Licensee.
+
+3. Licensee's use of the Thrust library is subject to the
+   terms and conditions of the Apache License Version 2.0.
+   All third-party software packages are copyright by their
+   respective authors. Apache License Version 2.0 terms and
+   conditions are hereby incorporated into the Agreement by
+   this reference.
+   http://www.apache.org/licenses/LICENSE-2.0.html
+
+   In addition, Licensee acknowledges the following notice:
+   Thrust includes source code from the Boost Iterator,
+   Tuple, System, and Random Number libraries.
+
+   Boost Software License - Version 1.0 - August 17th, 2003
+   . . . .
+
+   Permission is hereby granted, free of charge, to any person or
+   organization obtaining a copy of the software and accompanying
+   documentation covered by this license (the "Software") to use,
+   reproduce, display, distribute, execute, and transmit the Software,
+   and to prepare derivative works of the Software, and to permit
+   third-parties to whom the Software is furnished to do so, all
+   subject to the following:
+
+   The copyright notices in the Software and this entire statement,
+   including the above license grant, this restriction and the following
+   disclaimer, must be included in all copies of the Software, in whole
+   or in part, and all derivative works of the Software, unless such
+   copies or derivative works are solely in the form of machine-executable
+   object code generated by a source language processor.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE AND
+   NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR
+   ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE FOR ANY DAMAGES OR
+   OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, ARISING
+   FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+   OTHER DEALINGS IN THE SOFTWARE.
+
+4. Licensee's use of the LLVM third party component is
+   subject to the following terms and conditions:
+
+   ======================================================
+   LLVM Release License
+   ======================================================
+   University of Illinois/NCSA
+   Open Source License
+
+   Copyright (c) 2003-2010 University of Illinois at Urbana-Champaign.
+   All rights reserved.
+
+   Developed by:
+
+   LLVM Team
+
+   University of Illinois at Urbana-Champaign
+
+   http://llvm.org
+
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to
+   deal with the Software without restriction, including without limitation the
+   rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+   sell copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+
+   * Redistributions of source code must retain the above copyright notice,
+     this list of conditions and the following disclaimers.
+
+   * Redistributions in binary form must reproduce the above copyright
+     notice, this list of conditions and the following disclaimers in the
+     documentation and/or other materials provided with the distribution.
+
+   * Neither the names of the LLVM Team, University of Illinois at Urbana-
+     Champaign, nor the names of its contributors may be used to endorse or
+     promote products derived from this Software without specific prior
+     written permission.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+   THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+   OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+   ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+   DEALINGS WITH THE SOFTWARE.
+
+5. Licensee's use (e.g. nvprof) of the PCRE third party
+   component is subject to the following terms and
+   conditions:
+
+   ------------
+   PCRE LICENCE
+   ------------
+   PCRE is a library of functions to support regular expressions whose syntax
+   and semantics are as close as possible to those of the Perl 5 language.
+   Release 8 of PCRE is distributed under the terms of the "BSD" licence, as
+   specified below. The documentation for PCRE, supplied in the "doc"
+   directory, is distributed under the same terms as the software itself. The
+   basic library functions are written in C and are freestanding. Also
+   included in the distribution is a set of C++ wrapper functions, and a just-
+   in-time compiler that can be used to optimize pattern matching. These are
+   both optional features that can be omitted when the library is built.
+
+   THE BASIC LIBRARY FUNCTIONS
+   ---------------------------
+   Written by: Philip Hazel
+   Email local part: ph10
+   Email domain: cam.ac.uk
+   University of Cambridge Computing Service,
+   Cambridge, England.
+   Copyright (c) 1997-2012 University of Cambridge
+   All rights reserved.
+
+   PCRE JUST-IN-TIME COMPILATION SUPPORT
+   -------------------------------------
+   Written by: Zoltan Herczeg
+   Email local part: hzmester
+   Emain domain: freemail.hu
+   Copyright(c) 2010-2012 Zoltan Herczeg
+   All rights reserved.
+
+   STACK-LESS JUST-IN-TIME COMPILER
+   --------------------------------
+   Written by: Zoltan Herczeg
+   Email local part: hzmester
+   Emain domain: freemail.hu
+   Copyright(c) 2009-2012 Zoltan Herczeg
+   All rights reserved.
+
+   THE C++ WRAPPER FUNCTIONS
+   -------------------------
+   Contributed by: Google Inc.
+   Copyright (c) 2007-2012, Google Inc.
+   All rights reserved.
+
+   THE "BSD" LICENCE
+   -----------------
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are met:
+
+   * Redistributions of source code must retain the above copyright notice,
+     this list of conditions and the following disclaimer.
+
+   * Redistributions in binary form must reproduce the above copyright
+     notice, this list of conditions and the following disclaimer in the
+     documentation and/or other materials provided with the distribution.
+
+   * Neither the name of the University of Cambridge nor the name of Google
+     Inc. nor the names of their contributors may be used to endorse or
+     promote products derived from this software without specific prior
+     written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+   AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+   ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+   LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+   CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+   SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+   CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+   ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+   POSSIBILITY OF SUCH DAMAGE.
+
+6. Some of the cuBLAS library routines were written by or
+   derived from code written by Vasily Volkov and are subject
+   to the Modified Berkeley Software Distribution License as
+   follows:
+
+   Copyright (c) 2007-2009, Regents of the University of California
+
+   All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+   * Redistributions of source code must retain the above copyright
+     notice, this list of conditions and the following disclaimer.
+
* Redistributions in binary form must reproduce the above
|
| 1135 |
+
copyright notice, this list of conditions and the following
|
| 1136 |
+
disclaimer in the documentation and/or other materials provided
|
| 1137 |
+
with the distribution.
|
| 1138 |
+
* Neither the name of the University of California, Berkeley nor
|
| 1139 |
+
the names of its contributors may be used to endorse or promote
|
| 1140 |
+
products derived from this software without specific prior
|
| 1141 |
+
written permission.
|
| 1142 |
+
|
| 1143 |
+
THIS SOFTWARE IS PROVIDED BY THE AUTHOR "AS IS" AND ANY EXPRESS OR
|
| 1144 |
+
IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
| 1145 |
+
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
| 1146 |
+
DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
|
| 1147 |
+
INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
| 1148 |
+
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
| 1149 |
+
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
| 1150 |
+
HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
|
| 1151 |
+
STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
|
| 1152 |
+
IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
| 1153 |
+
POSSIBILITY OF SUCH DAMAGE.
|
| 1154 |
+
|
| 1155 |
+
7. Some of the cuBLAS library routines were written by or
|
| 1156 |
+
derived from code written by Davide Barbieri and are
|
| 1157 |
+
subject to the Modified Berkeley Software Distribution
|
| 1158 |
+
License as follows:
|
| 1159 |
+
|
| 1160 |
+
Copyright (c) 2008-2009 Davide Barbieri @ University of Rome Tor Vergata.
|
| 1161 |
+
|
| 1162 |
+
All rights reserved.
|
| 1163 |
+
|
| 1164 |
+
Redistribution and use in source and binary forms, with or without
|
| 1165 |
+
modification, are permitted provided that the following conditions are
|
| 1166 |
+
met:
|
| 1167 |
+
* Redistributions of source code must retain the above copyright
|
| 1168 |
+
notice, this list of conditions and the following disclaimer.
|
| 1169 |
+
* Redistributions in binary form must reproduce the above
|
| 1170 |
+
copyright notice, this list of conditions and the following
|
| 1171 |
+
disclaimer in the documentation and/or other materials provided
|
| 1172 |
+
with the distribution.
|
| 1173 |
+
* The name of the author may not be used to endorse or promote
|
| 1174 |
+
products derived from this software without specific prior
|
| 1175 |
+
written permission.
|
| 1176 |
+
|
| 1177 |
+
THIS SOFTWARE IS PROVIDED BY THE AUTHOR "AS IS" AND ANY EXPRESS OR
|
| 1178 |
+
IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
| 1179 |
+
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
| 1180 |
+
DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
|
| 1181 |
+
INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
| 1182 |
+
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
| 1183 |
+
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
| 1184 |
+
HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
|
| 1185 |
+
STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
|
| 1186 |
+
IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
| 1187 |
+
POSSIBILITY OF SUCH DAMAGE.
|
| 1188 |
+
|
| 1189 |
+
8. Some of the cuBLAS library routines were derived from
|
| 1190 |
+
code developed by the University of Tennessee and are
|
| 1191 |
+
subject to the Modified Berkeley Software Distribution
|
| 1192 |
+
License as follows:
|
| 1193 |
+
|
| 1194 |
+
Copyright (c) 2010 The University of Tennessee.
|
| 1195 |
+
|
| 1196 |
+
All rights reserved.
|
| 1197 |
+
|
| 1198 |
+
Redistribution and use in source and binary forms, with or without
|
| 1199 |
+
modification, are permitted provided that the following conditions are
|
| 1200 |
+
met:
|
| 1201 |
+
* Redistributions of source code must retain the above copyright
|
| 1202 |
+
notice, this list of conditions and the following disclaimer.
|
| 1203 |
+
* Redistributions in binary form must reproduce the above
|
| 1204 |
+
copyright notice, this list of conditions and the following
|
| 1205 |
+
disclaimer listed in this license in the documentation and/or
|
| 1206 |
+
other materials provided with the distribution.
|
| 1207 |
+
* Neither the name of the copyright holders nor the names of its
|
| 1208 |
+
contributors may be used to endorse or promote products derived
|
| 1209 |
+
from this software without specific prior written permission.
|
| 1210 |
+
|
| 1211 |
+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
| 1212 |
+
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
| 1213 |
+
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
| 1214 |
+
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
| 1215 |
+
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
| 1216 |
+
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
| 1217 |
+
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
| 1218 |
+
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
| 1219 |
+
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
| 1220 |
+
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
| 1221 |
+
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
| 1222 |
+
|
| 1223 |
+
9. Some of the cuBLAS library routines were written by or
|
| 1224 |
+
derived from code written by Jonathan Hogg and are subject
|
| 1225 |
+
to the Modified Berkeley Software Distribution License as
|
| 1226 |
+
follows:
|
| 1227 |
+
|
| 1228 |
+
Copyright (c) 2012, The Science and Technology Facilities Council (STFC).
|
| 1229 |
+
|
| 1230 |
+
All rights reserved.
|
| 1231 |
+
|
| 1232 |
+
Redistribution and use in source and binary forms, with or without
|
| 1233 |
+
modification, are permitted provided that the following conditions are
|
| 1234 |
+
met:
|
| 1235 |
+
* Redistributions of source code must retain the above copyright
|
| 1236 |
+
notice, this list of conditions and the following disclaimer.
|
| 1237 |
+
* Redistributions in binary form must reproduce the above
|
| 1238 |
+
copyright notice, this list of conditions and the following
|
| 1239 |
+
disclaimer in the documentation and/or other materials provided
|
| 1240 |
+
with the distribution.
|
| 1241 |
+
* Neither the name of the STFC nor the names of its contributors
|
| 1242 |
+
may be used to endorse or promote products derived from this
|
| 1243 |
+
software without specific prior written permission.
|
| 1244 |
+
|
| 1245 |
+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
| 1246 |
+
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
| 1247 |
+
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
| 1248 |
+
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE STFC BE
|
| 1249 |
+
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
| 1250 |
+
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
| 1251 |
+
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
|
| 1252 |
+
BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
| 1253 |
+
WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
|
| 1254 |
+
OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
|
| 1255 |
+
IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
| 1256 |
+
|
| 1257 |
+
10. Some of the cuBLAS library routines were written by or
|
| 1258 |
+
derived from code written by Ahmad M. Abdelfattah, David
|
| 1259 |
+
Keyes, and Hatem Ltaief, and are subject to the Apache
|
| 1260 |
+
License, Version 2.0, as follows:
|
| 1261 |
+
|
| 1262 |
+
-- (C) Copyright 2013 King Abdullah University of Science and Technology
|
| 1263 |
+
Authors:
|
| 1264 |
+
Ahmad Abdelfattah (ahmad.ahmad@kaust.edu.sa)
|
| 1265 |
+
David Keyes (david.keyes@kaust.edu.sa)
|
| 1266 |
+
Hatem Ltaief (hatem.ltaief@kaust.edu.sa)
|
| 1267 |
+
|
| 1268 |
+
Redistribution and use in source and binary forms, with or without
|
| 1269 |
+
modification, are permitted provided that the following conditions
|
| 1270 |
+
are met:
|
| 1271 |
+
|
| 1272 |
+
* Redistributions of source code must retain the above copyright
|
| 1273 |
+
notice, this list of conditions and the following disclaimer.
|
| 1274 |
+
* Redistributions in binary form must reproduce the above copyright
|
| 1275 |
+
notice, this list of conditions and the following disclaimer in the
|
| 1276 |
+
documentation and/or other materials provided with the distribution.
|
| 1277 |
+
* Neither the name of the King Abdullah University of Science and
|
| 1278 |
+
Technology nor the names of its contributors may be used to endorse
|
| 1279 |
+
or promote products derived from this software without specific prior
|
| 1280 |
+
written permission.
|
| 1281 |
+
|
| 1282 |
+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
| 1283 |
+
``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
| 1284 |
+
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
| 1285 |
+
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
| 1286 |
+
HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
| 1287 |
+
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
| 1288 |
+
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
| 1289 |
+
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
| 1290 |
+
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
| 1291 |
+
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
| 1292 |
+
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE
|
| 1293 |
+
|
| 1294 |
+
11. Some of the cuSPARSE library routines were written by or
|
| 1295 |
+
derived from code written by Li-Wen Chang and are subject
|
| 1296 |
+
to the NCSA Open Source License as follows:
|
| 1297 |
+
|
| 1298 |
+
Copyright (c) 2012, University of Illinois.
|
| 1299 |
+
|
| 1300 |
+
All rights reserved.
|
| 1301 |
+
|
| 1302 |
+
Developed by: IMPACT Group, University of Illinois, http://impact.crhc.illinois.edu
|
| 1303 |
+
|
| 1304 |
+
Permission is hereby granted, free of charge, to any person obtaining
|
| 1305 |
+
a copy of this software and associated documentation files (the
|
| 1306 |
+
"Software"), to deal with the Software without restriction, including
|
| 1307 |
+
without limitation the rights to use, copy, modify, merge, publish,
|
| 1308 |
+
distribute, sublicense, and/or sell copies of the Software, and to
|
| 1309 |
+
permit persons to whom the Software is furnished to do so, subject to
|
| 1310 |
+
the following conditions:
|
| 1311 |
+
* Redistributions of source code must retain the above copyright
|
| 1312 |
+
notice, this list of conditions and the following disclaimer.
|
| 1313 |
+
* Redistributions in binary form must reproduce the above
|
| 1314 |
+
copyright notice, this list of conditions and the following
|
| 1315 |
+
disclaimers in the documentation and/or other materials provided
|
| 1316 |
+
with the distribution.
|
| 1317 |
+
* Neither the names of IMPACT Group, University of Illinois, nor
|
| 1318 |
+
the names of its contributors may be used to endorse or promote
|
| 1319 |
+
products derived from this Software without specific prior
|
| 1320 |
+
written permission.
|
| 1321 |
+
|
| 1322 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
| 1323 |
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
| 1324 |
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
| 1325 |
+
NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
|
| 1326 |
+
HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
|
| 1327 |
+
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
|
| 1328 |
+
IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
|
| 1329 |
+
SOFTWARE.
|
| 1330 |
+
|
| 1331 |
+
12. Some of the cuRAND library routines were written by or
|
| 1332 |
+
derived from code written by Mutsuo Saito and Makoto
|
| 1333 |
+
Matsumoto and are subject to the following license:
|
| 1334 |
+
|
| 1335 |
+
Copyright (c) 2009, 2010 Mutsuo Saito, Makoto Matsumoto and Hiroshima
|
| 1336 |
+
University. All rights reserved.
|
| 1337 |
+
|
| 1338 |
+
Copyright (c) 2011 Mutsuo Saito, Makoto Matsumoto, Hiroshima
|
| 1339 |
+
University and University of Tokyo. All rights reserved.
|
| 1340 |
+
|
| 1341 |
+
Redistribution and use in source and binary forms, with or without
|
| 1342 |
+
modification, are permitted provided that the following conditions are
|
| 1343 |
+
met:
|
| 1344 |
+
* Redistributions of source code must retain the above copyright
|
| 1345 |
+
notice, this list of conditions and the following disclaimer.
|
| 1346 |
+
* Redistributions in binary form must reproduce the above
|
| 1347 |
+
copyright notice, this list of conditions and the following
|
| 1348 |
+
disclaimer in the documentation and/or other materials provided
|
| 1349 |
+
with the distribution.
|
| 1350 |
+
* Neither the name of the Hiroshima University nor the names of
|
| 1351 |
+
its contributors may be used to endorse or promote products
|
| 1352 |
+
derived from this software without specific prior written
|
| 1353 |
+
permission.
|
| 1354 |
+
|
| 1355 |
+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
| 1356 |
+
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
| 1357 |
+
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
| 1358 |
+
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
| 1359 |
+
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
| 1360 |
+
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
| 1361 |
+
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
| 1362 |
+
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
| 1363 |
+
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
| 1364 |
+
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
| 1365 |
+
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
| 1366 |
+
|
| 1367 |
+
13. Some of the cuRAND library routines were derived from
|
| 1368 |
+
code developed by D. E. Shaw Research and are subject to
|
| 1369 |
+
the following license:
|
| 1370 |
+
|
| 1371 |
+
Copyright 2010-2011, D. E. Shaw Research.
|
| 1372 |
+
|
| 1373 |
+
All rights reserved.
|
| 1374 |
+
|
| 1375 |
+
Redistribution and use in source and binary forms, with or without
|
| 1376 |
+
modification, are permitted provided that the following conditions are
|
| 1377 |
+
met:
|
| 1378 |
+
* Redistributions of source code must retain the above copyright
|
| 1379 |
+
notice, this list of conditions, and the following disclaimer.
|
| 1380 |
+
* Redistributions in binary form must reproduce the above
|
| 1381 |
+
copyright notice, this list of conditions, and the following
|
| 1382 |
+
disclaimer in the documentation and/or other materials provided
|
| 1383 |
+
with the distribution.
|
| 1384 |
+
* Neither the name of D. E. Shaw Research nor the names of its
|
| 1385 |
+
contributors may be used to endorse or promote products derived
|
| 1386 |
+
from this software without specific prior written permission.
|
| 1387 |
+
|
| 1388 |
+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
| 1389 |
+
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
| 1390 |
+
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
| 1391 |
+
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
| 1392 |
+
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
| 1393 |
+
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
| 1394 |
+
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
| 1395 |
+
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
| 1396 |
+
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
| 1397 |
+
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
| 1398 |
+
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
| 1399 |
+
|
| 1400 |
+
14. Some of the Math library routines were written by or
|
| 1401 |
+
derived from code developed by Norbert Juffa and are
|
| 1402 |
+
subject to the following license:
|
| 1403 |
+
|
| 1404 |
+
Copyright (c) 2015-2017, Norbert Juffa
|
| 1405 |
+
All rights reserved.
|
| 1406 |
+
|
| 1407 |
+
Redistribution and use in source and binary forms, with or without
|
| 1408 |
+
modification, are permitted provided that the following conditions
|
| 1409 |
+
are met:
|
| 1410 |
+
|
| 1411 |
+
1. Redistributions of source code must retain the above copyright
|
| 1412 |
+
notice, this list of conditions and the following disclaimer.
|
| 1413 |
+
|
| 1414 |
+
2. Redistributions in binary form must reproduce the above copyright
|
| 1415 |
+
notice, this list of conditions and the following disclaimer in the
|
| 1416 |
+
documentation and/or other materials provided with the distribution.
|
| 1417 |
+
|
| 1418 |
+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
| 1419 |
+
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
| 1420 |
+
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
| 1421 |
+
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
| 1422 |
+
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
| 1423 |
+
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
| 1424 |
+
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
| 1425 |
+
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
| 1426 |
+
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
| 1427 |
+
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
| 1428 |
+
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
| 1429 |
+
|
| 1430 |
+
15. Licensee's use of the lz4 third party component is
|
| 1431 |
+
subject to the following terms and conditions:
|
| 1432 |
+
|
| 1433 |
+
Copyright (C) 2011-2013, Yann Collet.
|
| 1434 |
+
BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
|
| 1435 |
+
|
| 1436 |
+
Redistribution and use in source and binary forms, with or without
|
| 1437 |
+
modification, are permitted provided that the following conditions are
|
| 1438 |
+
met:
|
| 1439 |
+
|
| 1440 |
+
* Redistributions of source code must retain the above copyright
|
| 1441 |
+
notice, this list of conditions and the following disclaimer.
|
| 1442 |
+
* Redistributions in binary form must reproduce the above
|
| 1443 |
+
copyright notice, this list of conditions and the following disclaimer
|
| 1444 |
+
in the documentation and/or other materials provided with the
|
| 1445 |
+
distribution.
|
| 1446 |
+
|
| 1447 |
+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
| 1448 |
+
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
| 1449 |
+
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
| 1450 |
+
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
| 1451 |
+
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
| 1452 |
+
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
| 1453 |
+
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
| 1454 |
+
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
| 1455 |
+
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
| 1456 |
+
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
| 1457 |
+
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
| 1458 |
+
|
| 1459 |
+
16. The NPP library uses code from the Boost Math Toolkit,
|
| 1460 |
+
and is subject to the following license:
|
| 1461 |
+
|
| 1462 |
+
Boost Software License - Version 1.0 - August 17th, 2003
|
| 1463 |
+
. . . .
|
| 1464 |
+
|
| 1465 |
+
Permission is hereby granted, free of charge, to any person or
|
| 1466 |
+
organization obtaining a copy of the software and accompanying
|
| 1467 |
+
documentation covered by this license (the "Software") to use,
|
| 1468 |
+
reproduce, display, distribute, execute, and transmit the Software,
|
| 1469 |
+
and to prepare derivative works of the Software, and to permit
|
| 1470 |
+
third-parties to whom the Software is furnished to do so, all
|
| 1471 |
+
subject to the following:
|
| 1472 |
+
|
| 1473 |
+
The copyright notices in the Software and this entire statement,
|
| 1474 |
+
including the above license grant, this restriction and the following
|
| 1475 |
+
disclaimer, must be included in all copies of the Software, in whole
|
| 1476 |
+
or in part, and all derivative works of the Software, unless such
|
| 1477 |
+
copies or derivative works are solely in the form of machine-executable
|
| 1478 |
+
object code generated by a source language processor.
|
| 1479 |
+
|
| 1480 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
| 1481 |
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
| 1482 |
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE AND
|
| 1483 |
+
NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR
|
| 1484 |
+
ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE FOR ANY DAMAGES OR
|
| 1485 |
+
OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, ARISING
|
| 1486 |
+
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
| 1487 |
+
OTHER DEALINGS IN THE SOFTWARE.
|
| 1488 |
+
|
| 1489 |
+
17. Portions of the Nsight Eclipse Edition is subject to the
|
| 1490 |
+
following license:
|
| 1491 |
+
|
| 1492 |
+
The Eclipse Foundation makes available all content in this plug-in
|
| 1493 |
+
("Content"). Unless otherwise indicated below, the Content is provided
|
| 1494 |
+
to you under the terms and conditions of the Eclipse Public License
|
| 1495 |
+
Version 1.0 ("EPL"). A copy of the EPL is available at http://
|
| 1496 |
+
www.eclipse.org/legal/epl-v10.html. For purposes of the EPL, "Program"
|
| 1497 |
+
will mean the Content.
|
| 1498 |
+
|
| 1499 |
+
If you did not receive this Content directly from the Eclipse
|
| 1500 |
+
Foundation, the Content is being redistributed by another party
|
| 1501 |
+
("Redistributor") and different terms and conditions may apply to your
|
| 1502 |
+
use of any object code in the Content. Check the Redistributor's
|
| 1503 |
+
license that was provided with the Content. If no such license exists,
|
| 1504 |
+
contact the Redistributor. Unless otherwise indicated below, the terms
|
| 1505 |
+
and conditions of the EPL still apply to any source code in the
|
| 1506 |
+
Content and such source code may be obtained at http://www.eclipse.org.
|
| 1507 |
+
|
| 1508 |
+
18. Some of the cuBLAS library routines uses code from
|
| 1509 |
+
OpenAI, which is subject to the following license:
|
| 1510 |
+
|
| 1511 |
+
License URL
|
| 1512 |
+
https://github.com/openai/openai-gemm/blob/master/LICENSE
|
| 1513 |
+
|
| 1514 |
+
License Text
|
| 1515 |
+
The MIT License
|
| 1516 |
+
|
| 1517 |
+
Copyright (c) 2016 OpenAI (http://openai.com), 2016 Google Inc.
|
| 1518 |
+
|
| 1519 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
| 1520 |
+
of this software and associated documentation files (the "Software"), to deal
|
| 1521 |
+
in the Software without restriction, including without limitation the rights
|
| 1522 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
| 1523 |
+
copies of the Software, and to permit persons to whom the Software is
|
| 1524 |
+
furnished to do so, subject to the following conditions:
|
| 1525 |
+
|
| 1526 |
+
The above copyright notice and this permission notice shall be included in
|
| 1527 |
+
all copies or substantial portions of the Software.
|
| 1528 |
+
|
| 1529 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
| 1530 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
| 1531 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
| 1532 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
| 1533 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
| 1534 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
| 1535 |
+
THE SOFTWARE.
|
| 1536 |
+
|
| 1537 |
+
19. Licensee's use of the Visual Studio Setup Configuration
|
| 1538 |
+
Samples is subject to the following license:
|
| 1539 |
+
|
| 1540 |
+
The MIT License (MIT)
|
| 1541 |
+
Copyright (C) Microsoft Corporation. All rights reserved.
|
| 1542 |
+
|
| 1543 |
+
Permission is hereby granted, free of charge, to any person
|
| 1544 |
+
obtaining a copy of this software and associated documentation
|
| 1545 |
+
files (the "Software"), to deal in the Software without restriction,
|
| 1546 |
+
including without limitation the rights to use, copy, modify, merge,
|
| 1547 |
+
publish, distribute, sublicense, and/or sell copies of the Software,
|
| 1548 |
+
and to permit persons to whom the Software is furnished to do so,
|
| 1549 |
+
subject to the following conditions:
|
| 1550 |
+
|
| 1551 |
+
The above copyright notice and this permission notice shall be included
|
| 1552 |
+
in all copies or substantial portions of the Software.
|
| 1553 |
+
|
| 1554 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
|
| 1555 |
+
OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
| 1556 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
| 1557 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
| 1558 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
| 1559 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
| 1560 |
+
|
| 1561 |
+
20. Licensee's use of linmath.h header for CPU functions for
|
| 1562 |
+
GL vector/matrix operations from lunarG is subject to the
|
| 1563 |
+
Apache License Version 2.0.
|
| 1564 |
+
|
| 1565 |
+
21. The DX12-CUDA sample uses the d3dx12.h header, which is
|
| 1566 |
+
subject to the MIT license .
|
| 1567 |
+
|
| 1568 |
+
-----------------
|
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia_curand_cu12-10.3.9.90.dist-info/METADATA
ADDED
@@ -0,0 +1,44 @@
Metadata-Version: 2.2
Name: nvidia-curand-cu12
Version: 10.3.9.90
Summary: CURAND native runtime libraries
Home-page: https://developer.nvidia.com/cuda-zone
Author: Nvidia CUDA Installer Team
Author-email: compute_installer@nvidia.com
License: NVIDIA Proprietary Software
Keywords: cuda,nvidia,runtime,machine learning,deep learning
Classifier: Development Status :: 4 - Beta
Classifier: Intended Audience :: Developers
Classifier: Intended Audience :: Education
Classifier: Intended Audience :: Science/Research
Classifier: License :: Other/Proprietary License
Classifier: Natural Language :: English
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.5
Classifier: Programming Language :: Python :: 3.6
Classifier: Programming Language :: Python :: 3.7
Classifier: Programming Language :: Python :: 3.8
Classifier: Programming Language :: Python :: 3.9
Classifier: Programming Language :: Python :: 3.10
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3 :: Only
Classifier: Topic :: Scientific/Engineering
Classifier: Topic :: Scientific/Engineering :: Mathematics
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
Classifier: Topic :: Software Development
Classifier: Topic :: Software Development :: Libraries
Classifier: Operating System :: Microsoft :: Windows
Classifier: Operating System :: POSIX :: Linux
Requires-Python: >=3
License-File: License.txt
Dynamic: author
Dynamic: author-email
Dynamic: classifier
Dynamic: description
Dynamic: home-page
Dynamic: keywords
Dynamic: license
Dynamic: requires-python
Dynamic: summary

CURAND native runtime libraries
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia_curand_cu12-10.3.9.90.dist-info/RECORD
ADDED
@@ -0,0 +1,32 @@
nvidia/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
nvidia/__pycache__/__init__.cpython-312.pyc,,
nvidia/curand/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
nvidia/curand/__pycache__/__init__.cpython-312.pyc,,
nvidia/curand/include/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
nvidia/curand/include/__pycache__/__init__.cpython-312.pyc,,
nvidia/curand/include/curand.h,sha256=strQ9idlRTQoBJy_hAbAT4pgkW6BKYg8p_nUjbb8BVw,44075
nvidia/curand/include/curand_discrete.h,sha256=2qD3BkI622XEu0444wVP7HeYkKAx0Rjr2HDhqU4SA7E,3486
nvidia/curand/include/curand_discrete2.h,sha256=ZrQTO5R9x83AMX88uq7M8M94DLSC5VEz0PAkfcwtQeg,10883
nvidia/curand/include/curand_globals.h,sha256=bES1Kx0NrATXk1DReMMkqWrB062nOnaAp39y22wViXU,3717
nvidia/curand/include/curand_kernel.h,sha256=SjfAeh13ybXIxiekcgczzua02kIAqETopJKRhYvCat8,53133
nvidia/curand/include/curand_lognormal.h,sha256=-X-iNkJSzWpAYYjogm689EJTZfzore9sxU7ObddljLk,28142
nvidia/curand/include/curand_mrg32k3a.h,sha256=ZVVREjGNsJQJ-3IzZZ_LKGtGteslicb8E0Aly49BKPs,170296
nvidia/curand/include/curand_mtgp32.h,sha256=Qhrmx0pHWF-P2Uu5bKwYE9ymEWq3c7qBzCITVMaKMfI,7845
nvidia/curand/include/curand_mtgp32_host.h,sha256=SXqzmSQkzTLSRJ4pojTg_TNCC3T-G89HdBK-boSDqr4,18274
nvidia/curand/include/curand_mtgp32_kernel.h,sha256=ajZnXr5ZXnQExElf6LPpigrrKPTmMIZbRyTEnJ-BDhw,13731
nvidia/curand/include/curand_mtgp32dc_p_11213.h,sha256=7_gGYUH47UugIAEt60vYH5nFa-QUwTpDwSEgLg9cZts,276889
nvidia/curand/include/curand_normal.h,sha256=lnmYVk2fn0oEVWOytdKhXrHL36GLCjMnB8OnZeCaYcA,26953
nvidia/curand/include/curand_normal_static.h,sha256=5K4iTC9AuSWCe1LVxuj_0y3BVjtp0bxO6hndv2rbmiw,4727
nvidia/curand/include/curand_philox4x32_x.h,sha256=T21IP-Rdg3_tSVU9Je4dLKuwEqE4ovfwi7r1hOY92Dw,7166
nvidia/curand/include/curand_poisson.h,sha256=KrhXOmO_D7aclnj8geIyHqdpSQwWHurS9V_pVtgzodM,25461
nvidia/curand/include/curand_precalc.h,sha256=I6NZdgT42fMm9qSCtP-rlOAqt4Zsqgal0ajktcPmEak,1392393
nvidia/curand/include/curand_uniform.h,sha256=gpmRgQu5r6ppgLTg60NXoDdVJS6wMUy6jC5bh8l04e8,17472
nvidia/curand/lib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
nvidia/curand/lib/__pycache__/__init__.cpython-312.pyc,,
nvidia/curand/lib/libcurand.so.10,sha256=-b6gOKJwO3IVcf1FopmomBQf2MsmSlkSY1yVEW9ZYP4,136749240
nvidia_curand_cu12-10.3.9.90.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
nvidia_curand_cu12-10.3.9.90.dist-info/License.txt,sha256=rW9YU_ugyg0VnQ9Y1JrkmDDC-Mk_epJki5zpCttMbM0,59262
nvidia_curand_cu12-10.3.9.90.dist-info/METADATA,sha256=fU3xSITD3i7JIsVG2ZXO5i-aDlIls-ry2JUVICEsv28,1684
nvidia_curand_cu12-10.3.9.90.dist-info/RECORD,,
nvidia_curand_cu12-10.3.9.90.dist-info/WHEEL,sha256=VtFLEVB-VX8niQT4kQ5pcQOOqiKvUvqfZe5V14HmU88,109
nvidia_curand_cu12-10.3.9.90.dist-info/top_level.txt,sha256=fTkAtiFuL16nUrB9ytDDtpytz2t0B4NvYTnRzwAhO14,7
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia_curand_cu12-10.3.9.90.dist-info/WHEEL
ADDED
@@ -0,0 +1,5 @@
Wheel-Version: 1.0
Generator: setuptools (75.8.0)
Root-Is-Purelib: true
Tag: py3-none-manylinux_2_27_x86_64
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/nvidia_curand_cu12-10.3.9.90.dist-info/top_level.txt
ADDED
@@ -0,0 +1 @@
nvidia