Spaces:
Sleeping
Sleeping
Upload folder using huggingface_hub
Browse files- .config/.last_opt_in_prompt.yaml +1 -0
- .config/.last_survey_prompt.yaml +1 -0
- .config/.last_update_check.json +1 -0
- .config/active_config +1 -0
- .config/config_sentinel +0 -0
- .config/configurations/config_default +6 -0
- .config/default_configs.db +0 -0
- .config/gce +1 -0
- .config/logs/2023.07.20/13.26.51.001473.log +596 -0
- .config/logs/2023.07.20/13.27.17.369260.log +5 -0
- .config/logs/2023.07.20/13.27.43.121533.log +169 -0
- .config/logs/2023.07.20/13.27.50.747950.log +5 -0
- .config/logs/2023.07.20/13.28.16.714039.log +8 -0
- .config/logs/2023.07.20/13.28.17.509819.log +8 -0
- .gitattributes +2 -0
- LICENSE +21 -0
- README.md +153 -7
- baby_llama2.py +25 -0
- configurator.py +47 -0
- import subprocess.py +29 -0
- model.bin +3 -0
- model.py +370 -0
- requirements.txt +8 -0
- run +0 -0
- run.c +490 -0
- run.exe +0 -0
- sample.py +69 -0
- sample_data/README.md +19 -0
- sample_data/anscombe.json +49 -0
- sample_data/california_housing_test.csv +0 -0
- sample_data/california_housing_train.csv +0 -0
- sample_data/mnist_test.csv +3 -0
- sample_data/mnist_train_small.csv +3 -0
- test_all.py +53 -0
- tinystories.py +166 -0
- tokenizer.bin +3 -0
- tokenizer.model +3 -0
- tokenizer.py +65 -0
- train.py +331 -0
.config/.last_opt_in_prompt.yaml
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{}
|
.config/.last_survey_prompt.yaml
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
last_prompt_time: 1689859662.3183093
|
.config/.last_update_check.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"last_update_check_time": 1689859670.1382627, "last_update_check_revision": 20230714124024, "notifications": [], "last_nag_times": {}}
|
.config/active_config
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
default
|
.config/config_sentinel
ADDED
File without changes
|
.config/configurations/config_default
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[component_manager]
|
2 |
+
disable_update_check = true
|
3 |
+
|
4 |
+
[compute]
|
5 |
+
gce_metadata_read_timeout_sec = 0
|
6 |
+
|
.config/default_configs.db
ADDED
Binary file (12.3 kB). View file
|
|
.config/gce
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
False
|
.config/logs/2023.07.20/13.26.51.001473.log
ADDED
@@ -0,0 +1,596 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
2023-07-20 13:26:51,007 DEBUG root Loaded Command Group: ['gcloud', 'components']
|
2 |
+
2023-07-20 13:26:51,012 DEBUG root Loaded Command Group: ['gcloud', 'components', 'update']
|
3 |
+
2023-07-20 13:26:51,014 DEBUG root Running [gcloud.components.update] with arguments: [--allow-no-backup: "True", --compile-python: "True", --quiet: "True", COMPONENT-IDS:7: "['core', 'gcloud-deps', 'bq', 'gcloud', 'gcloud-crc32c', 'gsutil', 'anthoscli']"]
|
4 |
+
2023-07-20 13:26:51,015 INFO ___FILE_ONLY___ Beginning update. This process may take several minutes.
|
5 |
+
|
6 |
+
2023-07-20 13:27:03,064 DEBUG urllib3.connectionpool Starting new HTTPS connection (1): dl.google.com:443
|
7 |
+
2023-07-20 13:27:03,161 DEBUG urllib3.connectionpool https://dl.google.com:443 "GET /dl/cloudsdk/channels/rapid/components-2.json HTTP/1.1" 200 208282
|
8 |
+
2023-07-20 13:27:03,175 INFO ___FILE_ONLY___
|
9 |
+
|
10 |
+
2023-07-20 13:27:03,176 INFO ___FILE_ONLY___
|
11 |
+
Your current Google Cloud CLI version is: 439.0.0
|
12 |
+
|
13 |
+
2023-07-20 13:27:03,176 INFO ___FILE_ONLY___ Installing components from version: 439.0.0
|
14 |
+
|
15 |
+
2023-07-20 13:27:03,176 INFO ___FILE_ONLY___
|
16 |
+
|
17 |
+
2023-07-20 13:27:03,176 DEBUG root Chosen display Format:table[box,title="These components will be removed."](details.display_name:label=Name:align=left,version.version_string:label=Version:align=right,data.size.size(zero="",min=1048576):label=Size:align=right)
|
18 |
+
2023-07-20 13:27:03,177 DEBUG root Chosen display Format:table[box,title="These components will be updated."](details.display_name:label=Name:align=left,version.version_string:label=Version:align=right,data.size.size(zero="",min=1048576):label=Size:align=right)
|
19 |
+
2023-07-20 13:27:03,177 DEBUG root Chosen display Format:table[box,title="These components will be installed."](details.display_name:label=Name:align=left,version.version_string:label=Version:align=right,data.size.size(zero="",min=1048576):label=Size:align=right)
|
20 |
+
2023-07-20 13:27:03,184 INFO ___FILE_ONLY___ ┌─────────────────────────────────────────────────────────────────────────────┐
|
21 |
+
2023-07-20 13:27:03,184 INFO ___FILE_ONLY___
|
22 |
+
|
23 |
+
2023-07-20 13:27:03,184 INFO ___FILE_ONLY___ │ These components will be installed. │
|
24 |
+
2023-07-20 13:27:03,184 INFO ___FILE_ONLY___
|
25 |
+
|
26 |
+
2023-07-20 13:27:03,184 INFO ___FILE_ONLY___ ├─────────────────────────────────────────────────────┬────────────┬──────────┤
|
27 |
+
2023-07-20 13:27:03,184 INFO ___FILE_ONLY___
|
28 |
+
|
29 |
+
2023-07-20 13:27:03,184 INFO ___FILE_ONLY___ │ Name │ Version │ Size │
|
30 |
+
2023-07-20 13:27:03,184 INFO ___FILE_ONLY___
|
31 |
+
|
32 |
+
2023-07-20 13:27:03,184 INFO ___FILE_ONLY___ ├─────────────────────────────────────────────────────┼────────────┼──────────┤
|
33 |
+
2023-07-20 13:27:03,184 INFO ___FILE_ONLY___
|
34 |
+
|
35 |
+
2023-07-20 13:27:03,185 INFO ___FILE_ONLY___ │
|
36 |
+
2023-07-20 13:27:03,185 INFO ___FILE_ONLY___ BigQuery Command Line Tool
|
37 |
+
2023-07-20 13:27:03,185 INFO ___FILE_ONLY___
|
38 |
+
2023-07-20 13:27:03,185 INFO ___FILE_ONLY___ │
|
39 |
+
2023-07-20 13:27:03,185 INFO ___FILE_ONLY___ 2.0.94
|
40 |
+
2023-07-20 13:27:03,185 INFO ___FILE_ONLY___
|
41 |
+
2023-07-20 13:27:03,185 INFO ___FILE_ONLY___ │
|
42 |
+
2023-07-20 13:27:03,185 INFO ___FILE_ONLY___ 1.6 MiB
|
43 |
+
2023-07-20 13:27:03,185 INFO ___FILE_ONLY___
|
44 |
+
2023-07-20 13:27:03,185 INFO ___FILE_ONLY___ │
|
45 |
+
2023-07-20 13:27:03,185 INFO ___FILE_ONLY___
|
46 |
+
|
47 |
+
2023-07-20 13:27:03,185 INFO ___FILE_ONLY___ │
|
48 |
+
2023-07-20 13:27:03,185 INFO ___FILE_ONLY___ BigQuery Command Line Tool (Platform Specific)
|
49 |
+
2023-07-20 13:27:03,185 INFO ___FILE_ONLY___
|
50 |
+
2023-07-20 13:27:03,185 INFO ___FILE_ONLY___ │
|
51 |
+
2023-07-20 13:27:03,186 INFO ___FILE_ONLY___ 2.0.77
|
52 |
+
2023-07-20 13:27:03,186 INFO ___FILE_ONLY___
|
53 |
+
2023-07-20 13:27:03,186 INFO ___FILE_ONLY___ │
|
54 |
+
2023-07-20 13:27:03,186 INFO ___FILE_ONLY___ < 1 MiB
|
55 |
+
2023-07-20 13:27:03,186 INFO ___FILE_ONLY___
|
56 |
+
2023-07-20 13:27:03,186 INFO ___FILE_ONLY___ │
|
57 |
+
2023-07-20 13:27:03,186 INFO ___FILE_ONLY___
|
58 |
+
|
59 |
+
2023-07-20 13:27:03,186 INFO ___FILE_ONLY___ │
|
60 |
+
2023-07-20 13:27:03,186 INFO ___FILE_ONLY___ Bundled Python 3.9
|
61 |
+
2023-07-20 13:27:03,186 INFO ___FILE_ONLY___
|
62 |
+
2023-07-20 13:27:03,186 INFO ___FILE_ONLY___ │
|
63 |
+
2023-07-20 13:27:03,186 INFO ___FILE_ONLY___ 3.9.16
|
64 |
+
2023-07-20 13:27:03,186 INFO ___FILE_ONLY___
|
65 |
+
2023-07-20 13:27:03,186 INFO ___FILE_ONLY___ │
|
66 |
+
2023-07-20 13:27:03,186 INFO ___FILE_ONLY___ 63.6 MiB
|
67 |
+
2023-07-20 13:27:03,186 INFO ___FILE_ONLY___
|
68 |
+
2023-07-20 13:27:03,186 INFO ___FILE_ONLY___ │
|
69 |
+
2023-07-20 13:27:03,187 INFO ___FILE_ONLY___
|
70 |
+
|
71 |
+
2023-07-20 13:27:03,187 INFO ___FILE_ONLY___ │
|
72 |
+
2023-07-20 13:27:03,187 INFO ___FILE_ONLY___ Cloud Storage Command Line Tool
|
73 |
+
2023-07-20 13:27:03,187 INFO ___FILE_ONLY___
|
74 |
+
2023-07-20 13:27:03,187 INFO ___FILE_ONLY___ │
|
75 |
+
2023-07-20 13:27:03,187 INFO ___FILE_ONLY___ 5.25
|
76 |
+
2023-07-20 13:27:03,187 INFO ___FILE_ONLY___
|
77 |
+
2023-07-20 13:27:03,187 INFO ___FILE_ONLY___ │
|
78 |
+
2023-07-20 13:27:03,187 INFO ___FILE_ONLY___ 11.3 MiB
|
79 |
+
2023-07-20 13:27:03,187 INFO ___FILE_ONLY___
|
80 |
+
2023-07-20 13:27:03,187 INFO ___FILE_ONLY___ │
|
81 |
+
2023-07-20 13:27:03,187 INFO ___FILE_ONLY___
|
82 |
+
|
83 |
+
2023-07-20 13:27:03,187 INFO ___FILE_ONLY___ │
|
84 |
+
2023-07-20 13:27:03,187 INFO ___FILE_ONLY___ Cloud Storage Command Line Tool (Platform Specific)
|
85 |
+
2023-07-20 13:27:03,187 INFO ___FILE_ONLY___
|
86 |
+
2023-07-20 13:27:03,187 INFO ___FILE_ONLY___ │
|
87 |
+
2023-07-20 13:27:03,188 INFO ___FILE_ONLY___ 5.13
|
88 |
+
2023-07-20 13:27:03,188 INFO ___FILE_ONLY___
|
89 |
+
2023-07-20 13:27:03,188 INFO ___FILE_ONLY___ │
|
90 |
+
2023-07-20 13:27:03,188 INFO ___FILE_ONLY___ < 1 MiB
|
91 |
+
2023-07-20 13:27:03,188 INFO ___FILE_ONLY___
|
92 |
+
2023-07-20 13:27:03,188 INFO ___FILE_ONLY___ │
|
93 |
+
2023-07-20 13:27:03,188 INFO ___FILE_ONLY___
|
94 |
+
|
95 |
+
2023-07-20 13:27:03,188 INFO ___FILE_ONLY___ │
|
96 |
+
2023-07-20 13:27:03,188 INFO ___FILE_ONLY___ Google Cloud CLI Core Libraries (Platform Specific)
|
97 |
+
2023-07-20 13:27:03,188 INFO ___FILE_ONLY___
|
98 |
+
2023-07-20 13:27:03,188 INFO ___FILE_ONLY___ │
|
99 |
+
2023-07-20 13:27:03,188 INFO ___FILE_ONLY___ 2022.09.20
|
100 |
+
2023-07-20 13:27:03,188 INFO ___FILE_ONLY___
|
101 |
+
2023-07-20 13:27:03,188 INFO ___FILE_ONLY___ │
|
102 |
+
2023-07-20 13:27:03,188 INFO ___FILE_ONLY___ < 1 MiB
|
103 |
+
2023-07-20 13:27:03,189 INFO ___FILE_ONLY___
|
104 |
+
2023-07-20 13:27:03,189 INFO ___FILE_ONLY___ │
|
105 |
+
2023-07-20 13:27:03,189 INFO ___FILE_ONLY___
|
106 |
+
|
107 |
+
2023-07-20 13:27:03,189 INFO ___FILE_ONLY___ │
|
108 |
+
2023-07-20 13:27:03,189 INFO ___FILE_ONLY___ Google Cloud CRC32C Hash Tool
|
109 |
+
2023-07-20 13:27:03,189 INFO ___FILE_ONLY___
|
110 |
+
2023-07-20 13:27:03,189 INFO ___FILE_ONLY___ │
|
111 |
+
2023-07-20 13:27:03,189 INFO ___FILE_ONLY___ 1.0.0
|
112 |
+
2023-07-20 13:27:03,189 INFO ___FILE_ONLY___
|
113 |
+
2023-07-20 13:27:03,189 INFO ___FILE_ONLY___ │
|
114 |
+
2023-07-20 13:27:03,189 INFO ___FILE_ONLY___ 1.2 MiB
|
115 |
+
2023-07-20 13:27:03,189 INFO ___FILE_ONLY___
|
116 |
+
2023-07-20 13:27:03,189 INFO ___FILE_ONLY___ │
|
117 |
+
2023-07-20 13:27:03,189 INFO ___FILE_ONLY___
|
118 |
+
|
119 |
+
2023-07-20 13:27:03,189 INFO ___FILE_ONLY___ │
|
120 |
+
2023-07-20 13:27:03,189 INFO ___FILE_ONLY___ anthoscli
|
121 |
+
2023-07-20 13:27:03,190 INFO ___FILE_ONLY___
|
122 |
+
2023-07-20 13:27:03,190 INFO ___FILE_ONLY___ │
|
123 |
+
2023-07-20 13:27:03,190 INFO ___FILE_ONLY___ 0.2.37
|
124 |
+
2023-07-20 13:27:03,190 INFO ___FILE_ONLY___
|
125 |
+
2023-07-20 13:27:03,190 INFO ___FILE_ONLY___ │
|
126 |
+
2023-07-20 13:27:03,190 INFO ___FILE_ONLY___ 68.4 MiB
|
127 |
+
2023-07-20 13:27:03,190 INFO ___FILE_ONLY___
|
128 |
+
2023-07-20 13:27:03,190 INFO ___FILE_ONLY___ │
|
129 |
+
2023-07-20 13:27:03,190 INFO ___FILE_ONLY___
|
130 |
+
|
131 |
+
2023-07-20 13:27:03,190 INFO ___FILE_ONLY___ │
|
132 |
+
2023-07-20 13:27:03,190 INFO ___FILE_ONLY___ gcloud cli dependencies
|
133 |
+
2023-07-20 13:27:03,190 INFO ___FILE_ONLY___
|
134 |
+
2023-07-20 13:27:03,190 INFO ___FILE_ONLY___ │
|
135 |
+
2023-07-20 13:27:03,190 INFO ___FILE_ONLY___ 2021.04.16
|
136 |
+
2023-07-20 13:27:03,190 INFO ___FILE_ONLY___
|
137 |
+
2023-07-20 13:27:03,190 INFO ___FILE_ONLY___ │
|
138 |
+
2023-07-20 13:27:03,191 INFO ___FILE_ONLY___ < 1 MiB
|
139 |
+
2023-07-20 13:27:03,191 INFO ___FILE_ONLY___
|
140 |
+
2023-07-20 13:27:03,191 INFO ___FILE_ONLY___ │
|
141 |
+
2023-07-20 13:27:03,191 INFO ___FILE_ONLY___
|
142 |
+
|
143 |
+
2023-07-20 13:27:03,191 INFO ___FILE_ONLY___ └─────────────────────────────────────────────────────┴────────────┴──────────┘
|
144 |
+
2023-07-20 13:27:03,191 INFO ___FILE_ONLY___
|
145 |
+
|
146 |
+
2023-07-20 13:27:03,191 INFO ___FILE_ONLY___
|
147 |
+
|
148 |
+
2023-07-20 13:27:03,195 DEBUG urllib3.connectionpool Starting new HTTPS connection (1): dl.google.com:443
|
149 |
+
2023-07-20 13:27:03,295 DEBUG urllib3.connectionpool https://dl.google.com:443 "GET /dl/cloudsdk/channels/rapid/RELEASE_NOTES HTTP/1.1" 200 1040023
|
150 |
+
2023-07-20 13:27:03,371 INFO ___FILE_ONLY___ For the latest full release notes, please visit:
|
151 |
+
https://cloud.google.com/sdk/release_notes
|
152 |
+
|
153 |
+
|
154 |
+
2023-07-20 13:27:03,374 INFO ___FILE_ONLY___ ╔═════════════════════════════════════════���══════════════════╗
|
155 |
+
|
156 |
+
2023-07-20 13:27:03,374 INFO ___FILE_ONLY___ ╠═ Creating update staging area ═╣
|
157 |
+
|
158 |
+
2023-07-20 13:27:03,374 INFO ___FILE_ONLY___ ╚
|
159 |
+
2023-07-20 13:27:03,374 INFO ___FILE_ONLY___ ══════
|
160 |
+
2023-07-20 13:27:03,375 INFO ___FILE_ONLY___ ══════
|
161 |
+
2023-07-20 13:27:03,375 INFO ___FILE_ONLY___ ══════
|
162 |
+
2023-07-20 13:27:03,613 INFO ___FILE_ONLY___ ═
|
163 |
+
2023-07-20 13:27:03,657 INFO ___FILE_ONLY___ ═
|
164 |
+
2023-07-20 13:27:03,683 INFO ___FILE_ONLY___ ═
|
165 |
+
2023-07-20 13:27:03,711 INFO ___FILE_ONLY___ ═
|
166 |
+
2023-07-20 13:27:03,747 INFO ___FILE_ONLY___ ═
|
167 |
+
2023-07-20 13:27:03,775 INFO ___FILE_ONLY___ ═
|
168 |
+
2023-07-20 13:27:03,809 INFO ___FILE_ONLY___ ═
|
169 |
+
2023-07-20 13:27:03,855 INFO ___FILE_ONLY___ ═
|
170 |
+
2023-07-20 13:27:03,905 INFO ___FILE_ONLY___ ═
|
171 |
+
2023-07-20 13:27:03,984 INFO ___FILE_ONLY___ ═
|
172 |
+
2023-07-20 13:27:04,145 INFO ___FILE_ONLY___ ═
|
173 |
+
2023-07-20 13:27:04,298 INFO ___FILE_ONLY___ ═
|
174 |
+
2023-07-20 13:27:04,421 INFO ___FILE_ONLY___ ═
|
175 |
+
2023-07-20 13:27:04,467 INFO ___FILE_ONLY___ ═
|
176 |
+
2023-07-20 13:27:04,527 INFO ___FILE_ONLY___ ═
|
177 |
+
2023-07-20 13:27:04,565 INFO ___FILE_ONLY___ ═
|
178 |
+
2023-07-20 13:27:04,607 INFO ___FILE_ONLY___ ═
|
179 |
+
2023-07-20 13:27:04,649 INFO ___FILE_ONLY___ ═
|
180 |
+
2023-07-20 13:27:04,717 INFO ___FILE_ONLY___ ═
|
181 |
+
2023-07-20 13:27:04,761 INFO ___FILE_ONLY___ ═
|
182 |
+
2023-07-20 13:27:04,802 INFO ___FILE_ONLY___ ═
|
183 |
+
2023-07-20 13:27:04,845 INFO ___FILE_ONLY___ ═
|
184 |
+
2023-07-20 13:27:04,888 INFO ___FILE_ONLY___ ═
|
185 |
+
2023-07-20 13:27:04,946 INFO ___FILE_ONLY___ ═
|
186 |
+
2023-07-20 13:27:04,989 INFO ___FILE_ONLY___ ═
|
187 |
+
2023-07-20 13:27:05,036 INFO ___FILE_ONLY___ ═
|
188 |
+
2023-07-20 13:27:05,083 INFO ___FILE_ONLY___ ═
|
189 |
+
2023-07-20 13:27:05,123 INFO ___FILE_ONLY___ ═
|
190 |
+
2023-07-20 13:27:05,182 INFO ___FILE_ONLY___ ═
|
191 |
+
2023-07-20 13:27:05,239 INFO ___FILE_ONLY___ ═
|
192 |
+
2023-07-20 13:27:05,295 INFO ___FILE_ONLY___ ═
|
193 |
+
2023-07-20 13:27:05,338 INFO ___FILE_ONLY___ ═
|
194 |
+
2023-07-20 13:27:05,387 INFO ___FILE_ONLY___ ═
|
195 |
+
2023-07-20 13:27:05,439 INFO ___FILE_ONLY___ ═
|
196 |
+
2023-07-20 13:27:05,497 INFO ___FILE_ONLY___ ═
|
197 |
+
2023-07-20 13:27:05,547 INFO ___FILE_ONLY___ ═
|
198 |
+
2023-07-20 13:27:05,603 INFO ___FILE_ONLY___ ═
|
199 |
+
2023-07-20 13:27:05,663 INFO ___FILE_ONLY___ ═
|
200 |
+
2023-07-20 13:27:05,715 INFO ___FILE_ONLY___ ═
|
201 |
+
2023-07-20 13:27:05,761 INFO ___FILE_ONLY___ ═
|
202 |
+
2023-07-20 13:27:05,812 INFO ___FILE_ONLY___ ═
|
203 |
+
2023-07-20 13:27:05,859 INFO ___FILE_ONLY___ ═
|
204 |
+
2023-07-20 13:27:05,860 INFO ___FILE_ONLY___ ╝
|
205 |
+
|
206 |
+
2023-07-20 13:27:05,944 INFO ___FILE_ONLY___ ╔════════════════════════════════════════════════════════════╗
|
207 |
+
|
208 |
+
2023-07-20 13:27:05,944 INFO ___FILE_ONLY___ ╠═ Installing: BigQuery Command Line Tool ═╣
|
209 |
+
|
210 |
+
2023-07-20 13:27:05,944 INFO ___FILE_ONLY___ ╚
|
211 |
+
2023-07-20 13:27:05,949 DEBUG urllib3.connectionpool Starting new HTTPS connection (1): dl.google.com:443
|
212 |
+
2023-07-20 13:27:06,056 DEBUG urllib3.connectionpool https://dl.google.com:443 "GET /dl/cloudsdk/channels/rapid/components/google-cloud-sdk-bq-20230714124024.tar.gz HTTP/1.1" 200 1695339
|
213 |
+
2023-07-20 13:27:06,126 INFO ___FILE_ONLY___ ═
|
214 |
+
2023-07-20 13:27:06,126 INFO ___FILE_ONLY___ ═
|
215 |
+
2023-07-20 13:27:06,126 INFO ___FILE_ONLY___ ═
|
216 |
+
2023-07-20 13:27:06,126 INFO ___FILE_ONLY___ ═
|
217 |
+
2023-07-20 13:27:06,126 INFO ___FILE_ONLY___ ═
|
218 |
+
2023-07-20 13:27:06,127 INFO ___FILE_ONLY___ ═
|
219 |
+
2023-07-20 13:27:06,127 INFO ___FILE_ONLY___ ═
|
220 |
+
2023-07-20 13:27:06,127 INFO ___FILE_ONLY___ ═
|
221 |
+
2023-07-20 13:27:06,127 INFO ___FILE_ONLY___ ═
|
222 |
+
2023-07-20 13:27:06,127 INFO ___FILE_ONLY___ ═
|
223 |
+
2023-07-20 13:27:06,127 INFO ___FILE_ONLY___ ═
|
224 |
+
2023-07-20 13:27:06,127 INFO ___FILE_ONLY___ ═
|
225 |
+
2023-07-20 13:27:06,128 INFO ___FILE_ONLY___ ═
|
226 |
+
2023-07-20 13:27:06,128 INFO ___FILE_ONLY___ ═
|
227 |
+
2023-07-20 13:27:06,128 INFO ___FILE_ONLY___ ═
|
228 |
+
2023-07-20 13:27:06,128 INFO ___FILE_ONLY___ ═
|
229 |
+
2023-07-20 13:27:06,128 INFO ___FILE_ONLY___ ═
|
230 |
+
2023-07-20 13:27:06,129 INFO ___FILE_ONLY___ ═
|
231 |
+
2023-07-20 13:27:06,129 INFO ___FILE_ONLY___ ═
|
232 |
+
2023-07-20 13:27:06,129 INFO ___FILE_ONLY___ ═
|
233 |
+
2023-07-20 13:27:06,129 INFO ___FILE_ONLY___ ═
|
234 |
+
2023-07-20 13:27:06,129 INFO ___FILE_ONLY___ ═
|
235 |
+
2023-07-20 13:27:06,129 INFO ___FILE_ONLY___ ═
|
236 |
+
2023-07-20 13:27:06,129 INFO ___FILE_ONLY___ ═
|
237 |
+
2023-07-20 13:27:06,130 INFO ___FILE_ONLY___ ═
|
238 |
+
2023-07-20 13:27:06,130 INFO ___FILE_ONLY___ ═
|
239 |
+
2023-07-20 13:27:06,130 INFO ___FILE_ONLY___ ═
|
240 |
+
2023-07-20 13:27:06,130 INFO ___FILE_ONLY___ ═
|
241 |
+
2023-07-20 13:27:06,130 INFO ___FILE_ONLY___ ═
|
242 |
+
2023-07-20 13:27:06,130 INFO ___FILE_ONLY___ ═
|
243 |
+
2023-07-20 13:27:06,252 INFO ___FILE_ONLY___ ═
|
244 |
+
2023-07-20 13:27:06,256 INFO ___FILE_ONLY___ ═
|
245 |
+
2023-07-20 13:27:06,260 INFO ___FILE_ONLY___ ═
|
246 |
+
2023-07-20 13:27:06,264 INFO ___FILE_ONLY___ ═
|
247 |
+
2023-07-20 13:27:06,267 INFO ___FILE_ONLY___ ═
|
248 |
+
2023-07-20 13:27:06,271 INFO ___FILE_ONLY___ ═
|
249 |
+
2023-07-20 13:27:06,276 INFO ___FILE_ONLY___ ═
|
250 |
+
2023-07-20 13:27:06,280 INFO ___FILE_ONLY___ ═
|
251 |
+
2023-07-20 13:27:06,284 INFO ___FILE_ONLY___ ═
|
252 |
+
2023-07-20 13:27:06,287 INFO ___FILE_ONLY___ ═
|
253 |
+
2023-07-20 13:27:06,291 INFO ___FILE_ONLY___ ═
|
254 |
+
2023-07-20 13:27:06,295 INFO ___FILE_ONLY___ ═
|
255 |
+
2023-07-20 13:27:06,301 INFO ___FILE_ONLY___ ═
|
256 |
+
2023-07-20 13:27:06,304 INFO ___FILE_ONLY___ ═
|
257 |
+
2023-07-20 13:27:06,307 INFO ___FILE_ONLY___ ═
|
258 |
+
2023-07-20 13:27:06,311 INFO ___FILE_ONLY___ ═
|
259 |
+
2023-07-20 13:27:06,316 INFO ___FILE_ONLY___ ═
|
260 |
+
2023-07-20 13:27:06,320 INFO ___FILE_ONLY___ ═
|
261 |
+
2023-07-20 13:27:06,326 INFO ___FILE_ONLY___ ═
|
262 |
+
2023-07-20 13:27:06,330 INFO ___FILE_ONLY___ ═
|
263 |
+
2023-07-20 13:27:06,335 INFO ___FILE_ONLY___ ═
|
264 |
+
2023-07-20 13:27:06,342 INFO ___FILE_ONLY___ ═
|
265 |
+
2023-07-20 13:27:06,348 INFO ___FILE_ONLY___ ═
|
266 |
+
2023-07-20 13:27:06,352 INFO ___FILE_ONLY___ ═
|
267 |
+
2023-07-20 13:27:06,356 INFO ___FILE_ONLY___ ═
|
268 |
+
2023-07-20 13:27:06,361 INFO ___FILE_ONLY___ ═
|
269 |
+
2023-07-20 13:27:06,365 INFO ___FILE_ONLY___ ═
|
270 |
+
2023-07-20 13:27:06,369 INFO ___FILE_ONLY___ ═
|
271 |
+
2023-07-20 13:27:06,373 INFO ___FILE_ONLY___ ═
|
272 |
+
2023-07-20 13:27:06,376 INFO ___FILE_ONLY___ ═
|
273 |
+
2023-07-20 13:27:06,377 INFO ___FILE_ONLY___ ╝
|
274 |
+
|
275 |
+
2023-07-20 13:27:06,393 INFO ___FILE_ONLY___ ╔════════════════════════════════════════════════════════════╗
|
276 |
+
|
277 |
+
2023-07-20 13:27:06,393 INFO ___FILE_ONLY___ ╠═ Installing: BigQuery Command Line Tool (Platform Spec... ═╣
|
278 |
+
|
279 |
+
2023-07-20 13:27:06,393 INFO ___FILE_ONLY___ ╚
|
280 |
+
2023-07-20 13:27:06,398 DEBUG urllib3.connectionpool Starting new HTTPS connection (1): dl.google.com:443
|
281 |
+
2023-07-20 13:27:06,499 DEBUG urllib3.connectionpool https://dl.google.com:443 "GET /dl/cloudsdk/channels/rapid/components/google-cloud-sdk-bq-nix-20220920185015.tar.gz HTTP/1.1" 200 1837
|
282 |
+
2023-07-20 13:27:06,500 INFO ___FILE_ONLY___ ══════════════════════════════
|
283 |
+
2023-07-20 13:27:06,501 INFO ___FILE_ONLY___ ══════════════════════════════
|
284 |
+
2023-07-20 13:27:06,501 INFO ___FILE_ONLY___ ╝
|
285 |
+
|
286 |
+
2023-07-20 13:27:06,510 INFO ___FILE_ONLY___ ╔════════════════════════════════════════════════════════════╗
|
287 |
+
|
288 |
+
2023-07-20 13:27:06,510 INFO ___FILE_ONLY___ ╠═ Installing: Bundled Python 3.9 ═╣
|
289 |
+
|
290 |
+
2023-07-20 13:27:06,510 INFO ___FILE_ONLY___ ╚
|
291 |
+
2023-07-20 13:27:06,514 INFO ___FILE_ONLY___ ════════════════════════════════════════════════════════════
|
292 |
+
2023-07-20 13:27:06,514 INFO ___FILE_ONLY___ ╝
|
293 |
+
|
294 |
+
2023-07-20 13:27:06,516 INFO ___FILE_ONLY___ ╔════════════════════════════════════════════════════════════╗
|
295 |
+
|
296 |
+
2023-07-20 13:27:06,517 INFO ___FILE_ONLY___ ╠═ Installing: Bundled Python 3.9 ═╣
|
297 |
+
|
298 |
+
2023-07-20 13:27:06,517 INFO ___FILE_ONLY___ ╚
|
299 |
+
2023-07-20 13:27:06,521 DEBUG urllib3.connectionpool Starting new HTTPS connection (1): dl.google.com:443
|
300 |
+
2023-07-20 13:27:06,629 DEBUG urllib3.connectionpool https://dl.google.com:443 "GET /dl/cloudsdk/channels/rapid/components/google-cloud-sdk-bundled-python3-unix-linux-x86_64-20230707144938.tar.gz HTTP/1.1" 200 66719069
|
301 |
+
2023-07-20 13:27:07,238 INFO ___FILE_ONLY___ ═
|
302 |
+
2023-07-20 13:27:07,241 INFO ___FILE_ONLY___ ═
|
303 |
+
2023-07-20 13:27:07,243 INFO ___FILE_ONLY___ ═
|
304 |
+
2023-07-20 13:27:07,246 INFO ___FILE_ONLY___ ═
|
305 |
+
2023-07-20 13:27:07,249 INFO ___FILE_ONLY___ ═
|
306 |
+
2023-07-20 13:27:07,251 INFO ___FILE_ONLY___ ═
|
307 |
+
2023-07-20 13:27:07,254 INFO ___FILE_ONLY___ ═
|
308 |
+
2023-07-20 13:27:07,256 INFO ___FILE_ONLY___ ═
|
309 |
+
2023-07-20 13:27:07,259 INFO ___FILE_ONLY___ ═
|
310 |
+
2023-07-20 13:27:07,261 INFO ___FILE_ONLY___ ═
|
311 |
+
2023-07-20 13:27:07,264 INFO ___FILE_ONLY___ ═
|
312 |
+
2023-07-20 13:27:07,266 INFO ___FILE_ONLY___ ═
|
313 |
+
2023-07-20 13:27:07,269 INFO ___FILE_ONLY___ ═
|
314 |
+
2023-07-20 13:27:07,272 INFO ___FILE_ONLY___ ═
|
315 |
+
2023-07-20 13:27:07,274 INFO ___FILE_ONLY___ ═
|
316 |
+
2023-07-20 13:27:07,277 INFO ___FILE_ONLY___ ═
|
317 |
+
2023-07-20 13:27:07,279 INFO ___FILE_ONLY___ ═
|
318 |
+
2023-07-20 13:27:07,282 INFO ___FILE_ONLY___ ═
|
319 |
+
2023-07-20 13:27:07,285 INFO ___FILE_ONLY___ ═
|
320 |
+
2023-07-20 13:27:07,288 INFO ___FILE_ONLY___ ═
|
321 |
+
2023-07-20 13:27:07,290 INFO ___FILE_ONLY___ ═
|
322 |
+
2023-07-20 13:27:07,293 INFO ___FILE_ONLY___ ═
|
323 |
+
2023-07-20 13:27:07,296 INFO ___FILE_ONLY___ ═
|
324 |
+
2023-07-20 13:27:07,299 INFO ___FILE_ONLY___ ═
|
325 |
+
2023-07-20 13:27:07,301 INFO ___FILE_ONLY___ ═
|
326 |
+
2023-07-20 13:27:07,304 INFO ___FILE_ONLY___ ═
|
327 |
+
2023-07-20 13:27:07,307 INFO ___FILE_ONLY___ ═
|
328 |
+
2023-07-20 13:27:07,309 INFO ___FILE_ONLY___ ═
|
329 |
+
2023-07-20 13:27:07,312 INFO ___FILE_ONLY___ ═
|
330 |
+
2023-07-20 13:27:07,314 INFO ___FILE_ONLY___ ═
|
331 |
+
2023-07-20 13:27:09,215 INFO ___FILE_ONLY___ ═
|
332 |
+
2023-07-20 13:27:09,227 INFO ___FILE_ONLY___ ═
|
333 |
+
2023-07-20 13:27:09,242 INFO ___FILE_ONLY___ ═
|
334 |
+
2023-07-20 13:27:09,255 INFO ___FILE_ONLY___ ═
|
335 |
+
2023-07-20 13:27:09,267 INFO ___FILE_ONLY___ ═
|
336 |
+
2023-07-20 13:27:09,284 INFO ___FILE_ONLY___ ═
|
337 |
+
2023-07-20 13:27:09,302 INFO ___FILE_ONLY___ ═
|
338 |
+
2023-07-20 13:27:09,323 INFO ___FILE_ONLY___ ═
|
339 |
+
2023-07-20 13:27:09,340 INFO ___FILE_ONLY___ ═
|
340 |
+
2023-07-20 13:27:09,352 INFO ___FILE_ONLY___ ═
|
341 |
+
2023-07-20 13:27:09,371 INFO ___FILE_ONLY___ ═
|
342 |
+
2023-07-20 13:27:09,391 INFO ___FILE_ONLY___ ═
|
343 |
+
2023-07-20 13:27:09,406 INFO ___FILE_ONLY___ ═
|
344 |
+
2023-07-20 13:27:09,528 INFO ___FILE_ONLY___ ═
|
345 |
+
2023-07-20 13:27:09,542 INFO ___FILE_ONLY___ ═
|
346 |
+
2023-07-20 13:27:09,637 INFO ___FILE_ONLY___ ═
|
347 |
+
2023-07-20 13:27:09,659 INFO ___FILE_ONLY___ ═
|
348 |
+
2023-07-20 13:27:09,680 INFO ___FILE_ONLY___ ═
|
349 |
+
2023-07-20 13:27:09,698 INFO ___FILE_ONLY___ ═
|
350 |
+
2023-07-20 13:27:09,734 INFO ___FILE_ONLY___ ═
|
351 |
+
2023-07-20 13:27:09,748 INFO ___FILE_ONLY___ ═
|
352 |
+
2023-07-20 13:27:09,761 INFO ___FILE_ONLY___ ═
|
353 |
+
2023-07-20 13:27:09,777 INFO ___FILE_ONLY___ ═
|
354 |
+
2023-07-20 13:27:09,794 INFO ___FILE_ONLY___ ═
|
355 |
+
2023-07-20 13:27:09,813 INFO ___FILE_ONLY___ ═
|
356 |
+
2023-07-20 13:27:09,831 INFO ___FILE_ONLY___ ═
|
357 |
+
2023-07-20 13:27:10,450 INFO ___FILE_ONLY___ ═
|
358 |
+
2023-07-20 13:27:10,740 INFO ___FILE_ONLY___ ═
|
359 |
+
2023-07-20 13:27:10,751 INFO ___FILE_ONLY___ ═
|
360 |
+
2023-07-20 13:27:10,763 INFO ___FILE_ONLY___ ═
|
361 |
+
2023-07-20 13:27:10,763 INFO ___FILE_ONLY___ ╝
|
362 |
+
|
363 |
+
2023-07-20 13:27:10,809 INFO ___FILE_ONLY___ ╔════════════════════════════════════════════════════════════╗
|
364 |
+
|
365 |
+
2023-07-20 13:27:10,809 INFO ___FILE_ONLY___ ╠═ Installing: Cloud Storage Command Line Tool ═╣
|
366 |
+
|
367 |
+
2023-07-20 13:27:10,809 INFO ___FILE_ONLY___ ╚
|
368 |
+
2023-07-20 13:27:10,814 DEBUG urllib3.connectionpool Starting new HTTPS connection (1): dl.google.com:443
|
369 |
+
2023-07-20 13:27:10,919 DEBUG urllib3.connectionpool https://dl.google.com:443 "GET /dl/cloudsdk/channels/rapid/components/google-cloud-sdk-gsutil-20230707144938.tar.gz HTTP/1.1" 200 11823782
|
370 |
+
2023-07-20 13:27:11,063 INFO ___FILE_ONLY___ ═
|
371 |
+
2023-07-20 13:27:11,063 INFO ___FILE_ONLY___ ═
|
372 |
+
2023-07-20 13:27:11,064 INFO ___FILE_ONLY___ ═
|
373 |
+
2023-07-20 13:27:11,064 INFO ___FILE_ONLY___ ═
|
374 |
+
2023-07-20 13:27:11,065 INFO ___FILE_ONLY___ ═
|
375 |
+
2023-07-20 13:27:11,066 INFO ___FILE_ONLY___ ═
|
376 |
+
2023-07-20 13:27:11,066 INFO ___FILE_ONLY___ ═
|
377 |
+
2023-07-20 13:27:11,067 INFO ___FILE_ONLY___ ═
|
378 |
+
2023-07-20 13:27:11,067 INFO ___FILE_ONLY___ ═
|
379 |
+
2023-07-20 13:27:11,068 INFO ___FILE_ONLY___ ═
|
380 |
+
2023-07-20 13:27:11,068 INFO ___FILE_ONLY___ ═
|
381 |
+
2023-07-20 13:27:11,069 INFO ___FILE_ONLY___ ═
|
382 |
+
2023-07-20 13:27:11,069 INFO ___FILE_ONLY___ ═
|
383 |
+
2023-07-20 13:27:11,070 INFO ___FILE_ONLY___ ═
|
384 |
+
2023-07-20 13:27:11,070 INFO ___FILE_ONLY___ ═
|
385 |
+
2023-07-20 13:27:11,071 INFO ___FILE_ONLY___ ═
|
386 |
+
2023-07-20 13:27:11,071 INFO ___FILE_ONLY___ ═
|
387 |
+
2023-07-20 13:27:11,072 INFO ___FILE_ONLY___ ═
|
388 |
+
2023-07-20 13:27:11,072 INFO ___FILE_ONLY___ ═
|
389 |
+
2023-07-20 13:27:11,073 INFO ___FILE_ONLY___ ═
|
390 |
+
2023-07-20 13:27:11,073 INFO ___FILE_ONLY___ ═
|
391 |
+
2023-07-20 13:27:11,074 INFO ___FILE_ONLY___ ═
|
392 |
+
2023-07-20 13:27:11,074 INFO ___FILE_ONLY___ ═
|
393 |
+
2023-07-20 13:27:11,075 INFO ___FILE_ONLY___ ═
|
394 |
+
2023-07-20 13:27:11,075 INFO ___FILE_ONLY___ ═
|
395 |
+
2023-07-20 13:27:11,076 INFO ___FILE_ONLY___ ═
|
396 |
+
2023-07-20 13:27:11,076 INFO ___FILE_ONLY___ ═
|
397 |
+
2023-07-20 13:27:11,077 INFO ___FILE_ONLY___ ═
|
398 |
+
2023-07-20 13:27:11,078 INFO ___FILE_ONLY___ ═
|
399 |
+
2023-07-20 13:27:11,078 INFO ___FILE_ONLY___ ═
|
400 |
+
2023-07-20 13:27:11,765 INFO ___FILE_ONLY___ ═
|
401 |
+
2023-07-20 13:27:11,794 INFO ___FILE_ONLY___ ═
|
402 |
+
2023-07-20 13:27:11,819 INFO ___FILE_ONLY___ ═
|
403 |
+
2023-07-20 13:27:11,848 INFO ___FILE_ONLY___ ═
|
404 |
+
2023-07-20 13:27:11,865 INFO ___FILE_ONLY___ ═
|
405 |
+
2023-07-20 13:27:11,885 INFO ___FILE_ONLY___ ═
|
406 |
+
2023-07-20 13:27:11,906 INFO ___FILE_ONLY___ ═
|
407 |
+
2023-07-20 13:27:11,929 INFO ___FILE_ONLY___ ═
|
408 |
+
2023-07-20 13:27:11,953 INFO ___FILE_ONLY___ ═
|
409 |
+
2023-07-20 13:27:11,989 INFO ___FILE_ONLY___ ═
|
410 |
+
2023-07-20 13:27:12,021 INFO ___FILE_ONLY___ ═
|
411 |
+
2023-07-20 13:27:12,046 INFO ___FILE_ONLY___ ═
|
412 |
+
2023-07-20 13:27:12,083 INFO ___FILE_ONLY___ ═
|
413 |
+
2023-07-20 13:27:12,115 INFO ___FILE_ONLY___ ═
|
414 |
+
2023-07-20 13:27:12,146 INFO ___FILE_ONLY___ ═
|
415 |
+
2023-07-20 13:27:12,169 INFO ___FILE_ONLY___ ═
|
416 |
+
2023-07-20 13:27:12,187 INFO ___FILE_ONLY___ ═
|
417 |
+
2023-07-20 13:27:12,209 INFO ___FILE_ONLY___ ═
|
418 |
+
2023-07-20 13:27:12,232 INFO ___FILE_ONLY___ ═
|
419 |
+
2023-07-20 13:27:12,258 INFO ___FILE_ONLY___ ═
|
420 |
+
2023-07-20 13:27:12,286 INFO ___FILE_ONLY___ ═
|
421 |
+
2023-07-20 13:27:12,312 INFO ___FILE_ONLY___ ═
|
422 |
+
2023-07-20 13:27:12,335 INFO ___FILE_ONLY___ ═
|
423 |
+
2023-07-20 13:27:12,383 INFO ___FILE_ONLY___ ═
|
424 |
+
2023-07-20 13:27:12,410 INFO ___FILE_ONLY___ ═
|
425 |
+
2023-07-20 13:27:12,436 INFO ___FILE_ONLY___ ═
|
426 |
+
2023-07-20 13:27:12,455 INFO ___FILE_ONLY___ ═
|
427 |
+
2023-07-20 13:27:12,474 INFO ___FILE_ONLY___ ═
|
428 |
+
2023-07-20 13:27:12,497 INFO ___FILE_ONLY___ ═
|
429 |
+
2023-07-20 13:27:12,518 INFO ___FILE_ONLY___ ═
|
430 |
+
2023-07-20 13:27:12,518 INFO ___FILE_ONLY___ ╝
|
431 |
+
|
432 |
+
2023-07-20 13:27:12,572 INFO ___FILE_ONLY___ ╔════════════════════════════════════════════════════════════╗
|
433 |
+
|
434 |
+
2023-07-20 13:27:12,572 INFO ___FILE_ONLY___ ╠═ Installing: Cloud Storage Command Line Tool (Platform... ═╣
|
435 |
+
|
436 |
+
2023-07-20 13:27:12,572 INFO ___FILE_ONLY___ ╚
|
437 |
+
2023-07-20 13:27:12,577 DEBUG urllib3.connectionpool Starting new HTTPS connection (1): dl.google.com:443
|
438 |
+
2023-07-20 13:27:12,674 DEBUG urllib3.connectionpool https://dl.google.com:443 "GET /dl/cloudsdk/channels/rapid/components/google-cloud-sdk-gsutil-nix-20220920185015.tar.gz HTTP/1.1" 200 1851
|
439 |
+
2023-07-20 13:27:12,675 INFO ___FILE_ONLY___ ══════════════════════════════
|
440 |
+
2023-07-20 13:27:12,676 INFO ___FILE_ONLY___ ══════════════════════════════
|
441 |
+
2023-07-20 13:27:12,676 INFO ___FILE_ONLY___ ╝
|
442 |
+
|
443 |
+
2023-07-20 13:27:12,685 INFO ___FILE_ONLY___ ╔════════════════════════════════════════════════════════════╗
|
444 |
+
|
445 |
+
2023-07-20 13:27:12,685 INFO ___FILE_ONLY___ ╠═ Installing: Default set of gcloud commands ═╣
|
446 |
+
|
447 |
+
2023-07-20 13:27:12,685 INFO ___FILE_ONLY___ ╚
|
448 |
+
2023-07-20 13:27:12,689 INFO ___FILE_ONLY___ ════════════════════════════════════════════════════════════
|
449 |
+
2023-07-20 13:27:12,689 INFO ___FILE_ONLY___ ╝
|
450 |
+
|
451 |
+
2023-07-20 13:27:12,691 INFO ___FILE_ONLY___ ╔════════════════════════════════════════════════════════════╗
|
452 |
+
|
453 |
+
2023-07-20 13:27:12,692 INFO ___FILE_ONLY___ ╠═ Installing: Google Cloud CLI Core Libraries (Platform... ═╣
|
454 |
+
|
455 |
+
2023-07-20 13:27:12,692 INFO ___FILE_ONLY___ ╚
|
456 |
+
2023-07-20 13:27:12,696 DEBUG urllib3.connectionpool Starting new HTTPS connection (1): dl.google.com:443
|
457 |
+
2023-07-20 13:27:12,795 DEBUG urllib3.connectionpool https://dl.google.com:443 "GET /dl/cloudsdk/channels/rapid/components/google-cloud-sdk-core-nix-20220920185015.tar.gz HTTP/1.1" 200 2221
|
458 |
+
2023-07-20 13:27:12,796 INFO ___FILE_ONLY___ ══════════════════════════════
|
459 |
+
2023-07-20 13:27:12,797 INFO ___FILE_ONLY___ ═══════════════
|
460 |
+
2023-07-20 13:27:12,797 INFO ___FILE_ONLY___ ═══════════════
|
461 |
+
2023-07-20 13:27:12,798 INFO ___FILE_ONLY___ ╝
|
462 |
+
|
463 |
+
2023-07-20 13:27:12,853 INFO ___FILE_ONLY___ ╔════════════════════════════════════════════════════════════╗
|
464 |
+
|
465 |
+
2023-07-20 13:27:12,854 INFO ___FILE_ONLY___ ╠═ Installing: Google Cloud CRC32C Hash Tool ═╣
|
466 |
+
|
467 |
+
2023-07-20 13:27:12,854 INFO ___FILE_ONLY___ ╚
|
468 |
+
2023-07-20 13:27:12,858 DEBUG urllib3.connectionpool Starting new HTTPS connection (1): dl.google.com:443
|
469 |
+
2023-07-20 13:27:13,022 DEBUG urllib3.connectionpool https://dl.google.com:443 "GET /dl/cloudsdk/channels/rapid/components/google-cloud-sdk-gcloud-crc32c-linux-x86_64-20230707144938.tar.gz HTTP/1.1" 200 1272169
|
470 |
+
2023-07-20 13:27:13,082 INFO ___FILE_ONLY___ ═
|
471 |
+
2023-07-20 13:27:13,082 INFO ___FILE_ONLY___ ═
|
472 |
+
2023-07-20 13:27:13,082 INFO ___FILE_ONLY___ ═
|
473 |
+
2023-07-20 13:27:13,082 INFO ___FILE_ONLY___ ═
|
474 |
+
2023-07-20 13:27:13,082 INFO ___FILE_ONLY___ ═
|
475 |
+
2023-07-20 13:27:13,083 INFO ___FILE_ONLY___ ═
|
476 |
+
2023-07-20 13:27:13,083 INFO ___FILE_ONLY___ ═
|
477 |
+
2023-07-20 13:27:13,083 INFO ___FILE_ONLY___ ═
|
478 |
+
2023-07-20 13:27:13,083 INFO ___FILE_ONLY___ ═
|
479 |
+
2023-07-20 13:27:13,083 INFO ___FILE_ONLY___ ═
|
480 |
+
2023-07-20 13:27:13,083 INFO ___FILE_ONLY___ ═
|
481 |
+
2023-07-20 13:27:13,083 INFO ___FILE_ONLY___ ═
|
482 |
+
2023-07-20 13:27:13,083 INFO ___FILE_ONLY___ ═
|
483 |
+
2023-07-20 13:27:13,084 INFO ___FILE_ONLY___ ═
|
484 |
+
2023-07-20 13:27:13,084 INFO ___FILE_ONLY___ ═
|
485 |
+
2023-07-20 13:27:13,084 INFO ___FILE_ONLY___ ═
|
486 |
+
2023-07-20 13:27:13,084 INFO ___FILE_ONLY___ ═
|
487 |
+
2023-07-20 13:27:13,084 INFO ___FILE_ONLY___ ═
|
488 |
+
2023-07-20 13:27:13,084 INFO ___FILE_ONLY___ ═
|
489 |
+
2023-07-20 13:27:13,084 INFO ___FILE_ONLY___ ═
|
490 |
+
2023-07-20 13:27:13,084 INFO ___FILE_ONLY___ ═
|
491 |
+
2023-07-20 13:27:13,085 INFO ___FILE_ONLY___ ═
|
492 |
+
2023-07-20 13:27:13,085 INFO ___FILE_ONLY___ ═
|
493 |
+
2023-07-20 13:27:13,085 INFO ___FILE_ONLY___ ═
|
494 |
+
2023-07-20 13:27:13,085 INFO ___FILE_ONLY___ ═
|
495 |
+
2023-07-20 13:27:13,085 INFO ___FILE_ONLY___ ═
|
496 |
+
2023-07-20 13:27:13,085 INFO ___FILE_ONLY___ ═
|
497 |
+
2023-07-20 13:27:13,085 INFO ___FILE_ONLY___ ═
|
498 |
+
2023-07-20 13:27:13,085 INFO ___FILE_ONLY___ ═
|
499 |
+
2023-07-20 13:27:13,085 INFO ___FILE_ONLY___ ═
|
500 |
+
2023-07-20 13:27:13,119 INFO ___FILE_ONLY___ ═══════════════
|
501 |
+
2023-07-20 13:27:13,120 INFO ___FILE_ONLY___ ═══════════════
|
502 |
+
2023-07-20 13:27:13,120 INFO ___FILE_ONLY___ ╝
|
503 |
+
|
504 |
+
2023-07-20 13:27:13,129 INFO ___FILE_ONLY___ ╔════════════════════════════════════════════════════════════╗
|
505 |
+
|
506 |
+
2023-07-20 13:27:13,129 INFO ___FILE_ONLY___ ╠═ Installing: Google Cloud CRC32C Hash Tool ═╣
|
507 |
+
|
508 |
+
2023-07-20 13:27:13,129 INFO ___FILE_ONLY___ ╚
|
509 |
+
2023-07-20 13:27:13,134 INFO ___FILE_ONLY___ ════════════════════════════════════════════════════════════
|
510 |
+
2023-07-20 13:27:13,134 INFO ___FILE_ONLY___ ╝
|
511 |
+
|
512 |
+
2023-07-20 13:27:13,136 INFO ___FILE_ONLY___ ╔════════════════════════════════════════════════════════════╗
|
513 |
+
|
514 |
+
2023-07-20 13:27:13,136 INFO ___FILE_ONLY___ ╠═ Installing: anthoscli ═╣
|
515 |
+
|
516 |
+
2023-07-20 13:27:13,136 INFO ___FILE_ONLY___ ╚
|
517 |
+
2023-07-20 13:27:13,141 DEBUG urllib3.connectionpool Starting new HTTPS connection (1): dl.google.com:443
|
518 |
+
2023-07-20 13:27:13,254 DEBUG urllib3.connectionpool https://dl.google.com:443 "GET /dl/cloudsdk/channels/rapid/components/google-cloud-sdk-anthoscli-linux-x86_64-20230707144938.tar.gz HTTP/1.1" 200 71750428
|
519 |
+
2023-07-20 13:27:14,274 INFO ___FILE_ONLY___ ═
|
520 |
+
2023-07-20 13:27:14,277 INFO ___FILE_ONLY___ ═
|
521 |
+
2023-07-20 13:27:14,280 INFO ___FILE_ONLY___ ═
|
522 |
+
2023-07-20 13:27:14,283 INFO ___FILE_ONLY___ ═
|
523 |
+
2023-07-20 13:27:14,286 INFO ___FILE_ONLY___ ═
|
524 |
+
2023-07-20 13:27:14,289 INFO ___FILE_ONLY___ ═
|
525 |
+
2023-07-20 13:27:14,292 INFO ___FILE_ONLY___ ═
|
526 |
+
2023-07-20 13:27:14,295 INFO ___FILE_ONLY___ ═
|
527 |
+
2023-07-20 13:27:14,298 INFO ___FILE_ONLY___ ═
|
528 |
+
2023-07-20 13:27:14,301 INFO ___FILE_ONLY___ ═
|
529 |
+
2023-07-20 13:27:14,304 INFO ___FILE_ONLY___ ═
|
530 |
+
2023-07-20 13:27:14,307 INFO ___FILE_ONLY___ ═
|
531 |
+
2023-07-20 13:27:14,309 INFO ___FILE_ONLY___ ═
|
532 |
+
2023-07-20 13:27:14,312 INFO ___FILE_ONLY___ ═
|
533 |
+
2023-07-20 13:27:14,315 INFO ___FILE_ONLY___ ═
|
534 |
+
2023-07-20 13:27:14,318 INFO ___FILE_ONLY___ ═
|
535 |
+
2023-07-20 13:27:14,321 INFO ___FILE_ONLY___ ═
|
536 |
+
2023-07-20 13:27:14,323 INFO ___FILE_ONLY___ ═
|
537 |
+
2023-07-20 13:27:14,326 INFO ___FILE_ONLY___ ═
|
538 |
+
2023-07-20 13:27:14,329 INFO ___FILE_ONLY___ ═
|
539 |
+
2023-07-20 13:27:14,331 INFO ___FILE_ONLY___ ═
|
540 |
+
2023-07-20 13:27:14,334 INFO ___FILE_ONLY___ ═
|
541 |
+
2023-07-20 13:27:14,337 INFO ___FILE_ONLY___ ═
|
542 |
+
2023-07-20 13:27:14,340 INFO ___FILE_ONLY___ ═
|
543 |
+
2023-07-20 13:27:14,342 INFO ___FILE_ONLY___ ═
|
544 |
+
2023-07-20 13:27:14,345 INFO ___FILE_ONLY___ ═
|
545 |
+
2023-07-20 13:27:14,348 INFO ___FILE_ONLY___ ═
|
546 |
+
2023-07-20 13:27:14,351 INFO ___FILE_ONLY___ ═
|
547 |
+
2023-07-20 13:27:14,354 INFO ___FILE_ONLY___ ═
|
548 |
+
2023-07-20 13:27:14,357 INFO ___FILE_ONLY___ ═
|
549 |
+
2023-07-20 13:27:16,587 INFO ___FILE_ONLY___ ══════════
|
550 |
+
2023-07-20 13:27:16,592 INFO ___FILE_ONLY___ ═════════
|
551 |
+
2023-07-20 13:27:16,619 INFO ___FILE_ONLY___ ═══════════
|
552 |
+
2023-07-20 13:27:16,620 INFO ___FILE_ONLY___ ╝
|
553 |
+
|
554 |
+
2023-07-20 13:27:16,641 INFO ___FILE_ONLY___ ╔═════════════════════════════════��══════════════════════════╗
|
555 |
+
|
556 |
+
2023-07-20 13:27:16,642 INFO ___FILE_ONLY___ ╠═ Installing: anthoscli ═╣
|
557 |
+
|
558 |
+
2023-07-20 13:27:16,642 INFO ___FILE_ONLY___ ╚
|
559 |
+
2023-07-20 13:27:16,646 INFO ___FILE_ONLY___ ════════════════════════════════════════════════════════════
|
560 |
+
2023-07-20 13:27:16,647 INFO ___FILE_ONLY___ ╝
|
561 |
+
|
562 |
+
2023-07-20 13:27:16,649 INFO ___FILE_ONLY___ ╔════════════════════════════════════════════════════════════╗
|
563 |
+
|
564 |
+
2023-07-20 13:27:16,649 INFO ___FILE_ONLY___ ╠═ Installing: gcloud cli dependencies ═╣
|
565 |
+
|
566 |
+
2023-07-20 13:27:16,649 INFO ___FILE_ONLY___ ╚
|
567 |
+
2023-07-20 13:27:16,654 DEBUG urllib3.connectionpool Starting new HTTPS connection (1): dl.google.com:443
|
568 |
+
2023-07-20 13:27:16,756 DEBUG urllib3.connectionpool https://dl.google.com:443 "GET /dl/cloudsdk/channels/rapid/components/google-cloud-sdk-gcloud-deps-linux-x86_64-20210416153011.tar.gz HTTP/1.1" 200 104
|
569 |
+
2023-07-20 13:27:16,757 INFO ___FILE_ONLY___ ══════════════════════════════
|
570 |
+
2023-07-20 13:27:16,757 INFO ___FILE_ONLY___ ══════════════════════════════
|
571 |
+
2023-07-20 13:27:16,757 INFO ___FILE_ONLY___ ╝
|
572 |
+
|
573 |
+
2023-07-20 13:27:16,766 INFO ___FILE_ONLY___ ╔════════════════════════════════════════════════════════════╗
|
574 |
+
|
575 |
+
2023-07-20 13:27:16,767 INFO ___FILE_ONLY___ ╠═ Creating backup and activating new installation ═╣
|
576 |
+
|
577 |
+
2023-07-20 13:27:16,767 INFO ___FILE_ONLY___ ╚
|
578 |
+
2023-07-20 13:27:16,767 DEBUG root Attempting to move directory [/tools/google-cloud-sdk] to [/tools/google-cloud-sdk.staging/.install/.backup]
|
579 |
+
2023-07-20 13:27:16,767 INFO ___FILE_ONLY___ ══════════════════════════════
|
580 |
+
2023-07-20 13:27:16,767 DEBUG root Attempting to move directory [/tools/google-cloud-sdk.staging] to [/tools/google-cloud-sdk]
|
581 |
+
2023-07-20 13:27:16,767 INFO ___FILE_ONLY___ ══════════════════════════════
|
582 |
+
2023-07-20 13:27:16,767 INFO ___FILE_ONLY___ ╝
|
583 |
+
|
584 |
+
2023-07-20 13:27:16,771 DEBUG root Updating notification cache...
|
585 |
+
2023-07-20 13:27:16,772 INFO ___FILE_ONLY___
|
586 |
+
|
587 |
+
2023-07-20 13:27:16,774 INFO ___FILE_ONLY___ Performing post processing steps...
|
588 |
+
2023-07-20 13:27:16,775 DEBUG root Executing command: ['python3', '-S', '/tools/google-cloud-sdk/lib/gcloud.py', 'components', 'post-process']
|
589 |
+
2023-07-20 13:27:42,251 DEBUG ___FILE_ONLY___
|
590 |
+
2023-07-20 13:27:42,252 DEBUG ___FILE_ONLY___
|
591 |
+
2023-07-20 13:27:42,313 INFO ___FILE_ONLY___
|
592 |
+
Update done!
|
593 |
+
|
594 |
+
|
595 |
+
2023-07-20 13:27:42,317 DEBUG root Chosen display Format:none
|
596 |
+
2023-07-20 13:27:42,317 INFO root Display format: "none"
|
.config/logs/2023.07.20/13.27.17.369260.log
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
2023-07-20 13:27:17,370 DEBUG root Loaded Command Group: ['gcloud', 'components']
|
2 |
+
2023-07-20 13:27:17,372 DEBUG root Loaded Command Group: ['gcloud', 'components', 'post_process']
|
3 |
+
2023-07-20 13:27:17,375 DEBUG root Running [gcloud.components.post-process] with arguments: []
|
4 |
+
2023-07-20 13:27:42,071 DEBUG root Chosen display Format:none
|
5 |
+
2023-07-20 13:27:42,072 INFO root Display format: "none"
|
.config/logs/2023.07.20/13.27.43.121533.log
ADDED
@@ -0,0 +1,169 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
2023-07-20 13:27:43,122 DEBUG root Loaded Command Group: ['gcloud', 'components']
|
2 |
+
2023-07-20 13:27:43,125 DEBUG root Loaded Command Group: ['gcloud', 'components', 'update']
|
3 |
+
2023-07-20 13:27:43,128 DEBUG root Running [gcloud.components.update] with arguments: [--quiet: "True", COMPONENT-IDS:8: "['gcloud', 'core', 'bq', 'gsutil', 'compute', 'preview', 'alpha', 'beta']"]
|
4 |
+
2023-07-20 13:27:43,129 INFO ___FILE_ONLY___ Beginning update. This process may take several minutes.
|
5 |
+
|
6 |
+
2023-07-20 13:27:43,135 DEBUG urllib3.connectionpool Starting new HTTPS connection (1): dl.google.com:443
|
7 |
+
2023-07-20 13:27:43,233 DEBUG urllib3.connectionpool https://dl.google.com:443 "GET /dl/cloudsdk/channels/rapid/components-2.json HTTP/1.1" 200 208282
|
8 |
+
2023-07-20 13:27:43,252 WARNING root Component [preview] no longer exists.
|
9 |
+
2023-07-20 13:27:43,252 WARNING root Component [compute] no longer exists.
|
10 |
+
2023-07-20 13:27:43,253 INFO ___FILE_ONLY___
|
11 |
+
|
12 |
+
2023-07-20 13:27:43,254 INFO ___FILE_ONLY___
|
13 |
+
Your current Google Cloud CLI version is: 439.0.0
|
14 |
+
|
15 |
+
2023-07-20 13:27:43,254 INFO ___FILE_ONLY___ Installing components from version: 439.0.0
|
16 |
+
|
17 |
+
2023-07-20 13:27:43,254 INFO ___FILE_ONLY___
|
18 |
+
|
19 |
+
2023-07-20 13:27:43,254 DEBUG root Chosen display Format:table[box,title="These components will be removed."](details.display_name:label=Name:align=left,version.version_string:label=Version:align=right,data.size.size(zero="",min=1048576):label=Size:align=right)
|
20 |
+
2023-07-20 13:27:43,255 DEBUG root Chosen display Format:table[box,title="These components will be updated."](details.display_name:label=Name:align=left,version.version_string:label=Version:align=right,data.size.size(zero="",min=1048576):label=Size:align=right)
|
21 |
+
2023-07-20 13:27:43,255 DEBUG root Chosen display Format:table[box,title="These components will be installed."](details.display_name:label=Name:align=left,version.version_string:label=Version:align=right,data.size.size(zero="",min=1048576):label=Size:align=right)
|
22 |
+
2023-07-20 13:27:43,257 INFO ___FILE_ONLY___ ┌──────────────────────────────────────────────┐
|
23 |
+
2023-07-20 13:27:43,257 INFO ___FILE_ONLY___
|
24 |
+
|
25 |
+
2023-07-20 13:27:43,257 INFO ___FILE_ONLY___ │ These components will be installed. │
|
26 |
+
2023-07-20 13:27:43,257 INFO ___FILE_ONLY___
|
27 |
+
|
28 |
+
2023-07-20 13:27:43,257 INFO ___FILE_ONLY___ ├───────────────────────┬────────────┬─────────┤
|
29 |
+
2023-07-20 13:27:43,257 INFO ___FILE_ONLY___
|
30 |
+
|
31 |
+
2023-07-20 13:27:43,257 INFO ___FILE_ONLY___ │ Name │ Version │ Size │
|
32 |
+
2023-07-20 13:27:43,258 INFO ___FILE_ONLY___
|
33 |
+
|
34 |
+
2023-07-20 13:27:43,258 INFO ___FILE_ONLY___ ├───────────────────────┼────────────┼─────────┤
|
35 |
+
2023-07-20 13:27:43,258 INFO ___FILE_ONLY___
|
36 |
+
|
37 |
+
2023-07-20 13:27:43,258 INFO ___FILE_ONLY___ │
|
38 |
+
2023-07-20 13:27:43,258 INFO ___FILE_ONLY___ gcloud Alpha Commands
|
39 |
+
2023-07-20 13:27:43,258 INFO ___FILE_ONLY___
|
40 |
+
2023-07-20 13:27:43,258 INFO ___FILE_ONLY___ │
|
41 |
+
2023-07-20 13:27:43,258 INFO ___FILE_ONLY___ 2023.07.14
|
42 |
+
2023-07-20 13:27:43,258 INFO ___FILE_ONLY___
|
43 |
+
2023-07-20 13:27:43,258 INFO ___FILE_ONLY___ │
|
44 |
+
2023-07-20 13:27:43,258 INFO ___FILE_ONLY___ < 1 MiB
|
45 |
+
2023-07-20 13:27:43,258 INFO ___FILE_ONLY___
|
46 |
+
2023-07-20 13:27:43,258 INFO ___FILE_ONLY___ │
|
47 |
+
2023-07-20 13:27:43,258 INFO ___FILE_ONLY___
|
48 |
+
|
49 |
+
2023-07-20 13:27:43,258 INFO ___FILE_ONLY___ │
|
50 |
+
2023-07-20 13:27:43,258 INFO ___FILE_ONLY___ gcloud Beta Commands
|
51 |
+
2023-07-20 13:27:43,259 INFO ___FILE_ONLY___
|
52 |
+
2023-07-20 13:27:43,259 INFO ___FILE_ONLY___ │
|
53 |
+
2023-07-20 13:27:43,259 INFO ___FILE_ONLY___ 2023.07.14
|
54 |
+
2023-07-20 13:27:43,259 INFO ___FILE_ONLY___
|
55 |
+
2023-07-20 13:27:43,259 INFO ___FILE_ONLY___ │
|
56 |
+
2023-07-20 13:27:43,259 INFO ___FILE_ONLY___ < 1 MiB
|
57 |
+
2023-07-20 13:27:43,259 INFO ___FILE_ONLY___
|
58 |
+
2023-07-20 13:27:43,259 INFO ___FILE_ONLY___ │
|
59 |
+
2023-07-20 13:27:43,259 INFO ___FILE_ONLY___
|
60 |
+
|
61 |
+
2023-07-20 13:27:43,259 INFO ___FILE_ONLY___ └───────────────────────┴────────────┴─────────┘
|
62 |
+
2023-07-20 13:27:43,259 INFO ___FILE_ONLY___
|
63 |
+
|
64 |
+
2023-07-20 13:27:43,259 INFO ___FILE_ONLY___
|
65 |
+
|
66 |
+
2023-07-20 13:27:43,264 DEBUG urllib3.connectionpool Starting new HTTPS connection (1): dl.google.com:443
|
67 |
+
2023-07-20 13:27:43,365 DEBUG urllib3.connectionpool https://dl.google.com:443 "GET /dl/cloudsdk/channels/rapid/RELEASE_NOTES HTTP/1.1" 200 1040023
|
68 |
+
2023-07-20 13:27:43,443 INFO ___FILE_ONLY___ For the latest full release notes, please visit:
|
69 |
+
https://cloud.google.com/sdk/release_notes
|
70 |
+
|
71 |
+
|
72 |
+
2023-07-20 13:27:43,446 INFO ___FILE_ONLY___ ╔════════════════════════════════════════════════════════════╗
|
73 |
+
|
74 |
+
2023-07-20 13:27:43,446 INFO ___FILE_ONLY___ ╠═ Creating update staging area ═╣
|
75 |
+
|
76 |
+
2023-07-20 13:27:43,446 INFO ___FILE_ONLY___ ╚
|
77 |
+
2023-07-20 13:27:43,446 INFO ___FILE_ONLY___ ══════
|
78 |
+
2023-07-20 13:27:44,031 INFO ___FILE_ONLY___ ══════
|
79 |
+
2023-07-20 13:27:44,031 INFO ___FILE_ONLY___ ══════
|
80 |
+
2023-07-20 13:27:44,433 INFO ___FILE_ONLY___ ═
|
81 |
+
2023-07-20 13:27:44,633 INFO ___FILE_ONLY___ ═
|
82 |
+
2023-07-20 13:27:44,749 INFO ___FILE_ONLY___ ═
|
83 |
+
2023-07-20 13:27:44,845 INFO ___FILE_ONLY___ ═
|
84 |
+
2023-07-20 13:27:45,073 INFO ___FILE_ONLY___ ═
|
85 |
+
2023-07-20 13:27:45,164 INFO ___FILE_ONLY___ ═
|
86 |
+
2023-07-20 13:27:45,230 INFO ___FILE_ONLY___ ═
|
87 |
+
2023-07-20 13:27:45,332 INFO ___FILE_ONLY___ ═
|
88 |
+
2023-07-20 13:27:45,399 INFO ___FILE_ONLY___ ═
|
89 |
+
2023-07-20 13:27:45,473 INFO ___FILE_ONLY___ ═
|
90 |
+
2023-07-20 13:27:45,576 INFO ___FILE_ONLY___ ═
|
91 |
+
2023-07-20 13:27:45,636 INFO ___FILE_ONLY___ ═
|
92 |
+
2023-07-20 13:27:45,727 INFO ___FILE_ONLY___ ═
|
93 |
+
2023-07-20 13:27:45,900 INFO ___FILE_ONLY___ ═
|
94 |
+
2023-07-20 13:27:45,999 INFO ___FILE_ONLY___ ═
|
95 |
+
2023-07-20 13:27:46,032 INFO ___FILE_ONLY___ ═
|
96 |
+
2023-07-20 13:27:46,071 INFO ___FILE_ONLY___ ═
|
97 |
+
2023-07-20 13:27:46,105 INFO ___FILE_ONLY___ ═
|
98 |
+
2023-07-20 13:27:46,146 INFO ___FILE_ONLY___ ═
|
99 |
+
2023-07-20 13:27:46,227 INFO ___FILE_ONLY___ ═
|
100 |
+
2023-07-20 13:27:46,276 INFO ___FILE_ONLY___ ═
|
101 |
+
2023-07-20 13:27:46,366 INFO ___FILE_ONLY___ ═
|
102 |
+
2023-07-20 13:27:46,606 INFO ___FILE_ONLY___ ═
|
103 |
+
2023-07-20 13:27:46,736 INFO ___FILE_ONLY___ ═
|
104 |
+
2023-07-20 13:27:46,934 INFO ___FILE_ONLY___ ═
|
105 |
+
2023-07-20 13:27:47,011 INFO ___FILE_ONLY___ ═
|
106 |
+
2023-07-20 13:27:47,069 INFO ___FILE_ONLY___ ═
|
107 |
+
2023-07-20 13:27:47,132 INFO ___FILE_ONLY___ ═
|
108 |
+
2023-07-20 13:27:47,189 INFO ___FILE_ONLY___ ═
|
109 |
+
2023-07-20 13:27:47,234 INFO ___FILE_ONLY___ ═
|
110 |
+
2023-07-20 13:27:47,283 INFO ___FILE_ONLY___ ═
|
111 |
+
2023-07-20 13:27:47,332 INFO ___FILE_ONLY___ ═
|
112 |
+
2023-07-20 13:27:47,388 INFO ___FILE_ONLY___ ═
|
113 |
+
2023-07-20 13:27:47,461 INFO ___FILE_ONLY___ ═
|
114 |
+
2023-07-20 13:27:47,523 INFO ___FILE_ONLY___ ═
|
115 |
+
2023-07-20 13:27:47,574 INFO ___FILE_ONLY___ ═
|
116 |
+
2023-07-20 13:27:47,629 INFO ___FILE_ONLY___ ═
|
117 |
+
2023-07-20 13:27:47,794 INFO ___FILE_ONLY___ ═
|
118 |
+
2023-07-20 13:27:47,847 INFO ___FILE_ONLY___ ═
|
119 |
+
2023-07-20 13:27:47,901 INFO ___FILE_ONLY___ ═
|
120 |
+
2023-07-20 13:27:47,980 INFO ___FILE_ONLY___ ═
|
121 |
+
2023-07-20 13:27:48,034 INFO ___FILE_ONLY___ ═
|
122 |
+
2023-07-20 13:27:48,034 INFO ___FILE_ONLY___ ╝
|
123 |
+
|
124 |
+
2023-07-20 13:27:49,851 INFO ___FILE_ONLY___ ╔════════════════════════════════════════════════════════════╗
|
125 |
+
|
126 |
+
2023-07-20 13:27:49,852 INFO ___FILE_ONLY___ ╠═ Installing: gcloud Alpha Commands ═╣
|
127 |
+
|
128 |
+
2023-07-20 13:27:49,852 INFO ___FILE_ONLY___ ╚
|
129 |
+
2023-07-20 13:27:49,856 DEBUG urllib3.connectionpool Starting new HTTPS connection (1): dl.google.com:443
|
130 |
+
2023-07-20 13:27:50,010 DEBUG urllib3.connectionpool https://dl.google.com:443 "GET /dl/cloudsdk/channels/rapid/components/google-cloud-sdk-alpha-20230714124024.tar.gz HTTP/1.1" 200 800
|
131 |
+
2023-07-20 13:27:50,011 INFO ___FILE_ONLY___ ══════════════════════════════
|
132 |
+
2023-07-20 13:27:50,013 INFO ___FILE_ONLY___ ══════════════════════════════
|
133 |
+
2023-07-20 13:27:50,013 INFO ___FILE_ONLY___ ╝
|
134 |
+
|
135 |
+
2023-07-20 13:27:50,021 INFO ___FILE_ONLY___ ╔════════════════════════════════════════════════════════════╗
|
136 |
+
|
137 |
+
2023-07-20 13:27:50,021 INFO ___FILE_ONLY___ ╠═ Installing: gcloud Beta Commands ═╣
|
138 |
+
|
139 |
+
2023-07-20 13:27:50,021 INFO ___FILE_ONLY___ ╚
|
140 |
+
2023-07-20 13:27:50,025 DEBUG urllib3.connectionpool Starting new HTTPS connection (1): dl.google.com:443
|
141 |
+
2023-07-20 13:27:50,121 DEBUG urllib3.connectionpool https://dl.google.com:443 "GET /dl/cloudsdk/channels/rapid/components/google-cloud-sdk-beta-20230714124024.tar.gz HTTP/1.1" 200 797
|
142 |
+
2023-07-20 13:27:50,122 INFO ___FILE_ONLY___ ══════════════════════════════
|
143 |
+
2023-07-20 13:27:50,123 INFO ___FILE_ONLY___ ══════════════════════════════
|
144 |
+
2023-07-20 13:27:50,123 INFO ___FILE_ONLY___ ╝
|
145 |
+
|
146 |
+
2023-07-20 13:27:50,133 INFO ___FILE_ONLY___ ��════════════════════════════════════════════════════════════╗
|
147 |
+
|
148 |
+
2023-07-20 13:27:50,133 INFO ___FILE_ONLY___ ╠═ Creating backup and activating new installation ═╣
|
149 |
+
|
150 |
+
2023-07-20 13:27:50,133 INFO ___FILE_ONLY___ ╚
|
151 |
+
2023-07-20 13:27:50,133 DEBUG root Attempting to move directory [/tools/google-cloud-sdk] to [/tools/google-cloud-sdk.staging/.install/.backup]
|
152 |
+
2023-07-20 13:27:50,133 INFO ___FILE_ONLY___ ══════════════════════════════
|
153 |
+
2023-07-20 13:27:50,134 DEBUG root Attempting to move directory [/tools/google-cloud-sdk.staging] to [/tools/google-cloud-sdk]
|
154 |
+
2023-07-20 13:27:50,134 INFO ___FILE_ONLY___ ══════════════════════════════
|
155 |
+
2023-07-20 13:27:50,134 INFO ___FILE_ONLY___ ╝
|
156 |
+
|
157 |
+
2023-07-20 13:27:50,138 DEBUG root Updating notification cache...
|
158 |
+
2023-07-20 13:27:50,138 INFO ___FILE_ONLY___
|
159 |
+
|
160 |
+
2023-07-20 13:27:50,140 INFO ___FILE_ONLY___ Performing post processing steps...
|
161 |
+
2023-07-20 13:27:50,141 DEBUG root Executing command: ['python3', '-S', '/tools/google-cloud-sdk/lib/gcloud.py', 'components', 'post-process']
|
162 |
+
2023-07-20 13:28:15,915 DEBUG ___FILE_ONLY___
|
163 |
+
2023-07-20 13:28:15,916 DEBUG ___FILE_ONLY___
|
164 |
+
2023-07-20 13:28:15,929 INFO ___FILE_ONLY___
|
165 |
+
Update done!
|
166 |
+
|
167 |
+
|
168 |
+
2023-07-20 13:28:15,932 DEBUG root Chosen display Format:none
|
169 |
+
2023-07-20 13:28:15,933 INFO root Display format: "none"
|
.config/logs/2023.07.20/13.27.50.747950.log
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
2023-07-20 13:27:50,749 DEBUG root Loaded Command Group: ['gcloud', 'components']
|
2 |
+
2023-07-20 13:27:50,751 DEBUG root Loaded Command Group: ['gcloud', 'components', 'post_process']
|
3 |
+
2023-07-20 13:27:50,753 DEBUG root Running [gcloud.components.post-process] with arguments: []
|
4 |
+
2023-07-20 13:28:15,731 DEBUG root Chosen display Format:none
|
5 |
+
2023-07-20 13:28:15,732 INFO root Display format: "none"
|
.config/logs/2023.07.20/13.28.16.714039.log
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
2023-07-20 13:28:16,716 DEBUG root Loaded Command Group: ['gcloud', 'config']
|
2 |
+
2023-07-20 13:28:16,746 DEBUG root Loaded Command Group: ['gcloud', 'config', 'set']
|
3 |
+
2023-07-20 13:28:16,749 DEBUG root Running [gcloud.config.set] with arguments: [SECTION/PROPERTY: "component_manager/disable_update_check", VALUE: "true"]
|
4 |
+
2023-07-20 13:28:16,750 INFO ___FILE_ONLY___ Updated property [component_manager/disable_update_check].
|
5 |
+
|
6 |
+
2023-07-20 13:28:16,751 DEBUG root Chosen display Format:default
|
7 |
+
2023-07-20 13:28:16,752 INFO root Display format: "default"
|
8 |
+
2023-07-20 13:28:16,752 DEBUG root SDK update checks are disabled.
|
.config/logs/2023.07.20/13.28.17.509819.log
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
2023-07-20 13:28:17,511 DEBUG root Loaded Command Group: ['gcloud', 'config']
|
2 |
+
2023-07-20 13:28:17,542 DEBUG root Loaded Command Group: ['gcloud', 'config', 'set']
|
3 |
+
2023-07-20 13:28:17,545 DEBUG root Running [gcloud.config.set] with arguments: [SECTION/PROPERTY: "compute/gce_metadata_read_timeout_sec", VALUE: "0"]
|
4 |
+
2023-07-20 13:28:17,546 INFO ___FILE_ONLY___ Updated property [compute/gce_metadata_read_timeout_sec].
|
5 |
+
|
6 |
+
2023-07-20 13:28:17,547 DEBUG root Chosen display Format:default
|
7 |
+
2023-07-20 13:28:17,548 INFO root Display format: "default"
|
8 |
+
2023-07-20 13:28:17,548 DEBUG root SDK update checks are disabled.
|
.gitattributes
CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
sample_data/mnist_test.csv filter=lfs diff=lfs merge=lfs -text
|
37 |
+
sample_data/mnist_train_small.csv filter=lfs diff=lfs merge=lfs -text
|
LICENSE
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
MIT License
|
2 |
+
|
3 |
+
Copyright (c) 2023 Andrej
|
4 |
+
|
5 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6 |
+
of this software and associated documentation files (the "Software"), to deal
|
7 |
+
in the Software without restriction, including without limitation the rights
|
8 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9 |
+
copies of the Software, and to permit persons to whom the Software is
|
10 |
+
furnished to do so, subject to the following conditions:
|
11 |
+
|
12 |
+
The above copyright notice and this permission notice shall be included in all
|
13 |
+
copies or substantial portions of the Software.
|
14 |
+
|
15 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21 |
+
SOFTWARE.
|
README.md
CHANGED
@@ -1,12 +1,158 @@
|
|
1 |
---
|
2 |
-
title:
|
3 |
-
|
4 |
-
colorFrom: yellow
|
5 |
-
colorTo: indigo
|
6 |
sdk: gradio
|
7 |
sdk_version: 3.38.0
|
8 |
-
app_file: app.py
|
9 |
-
pinned: false
|
10 |
---
|
11 |
|
12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
---
|
2 |
+
title: play_with_baby_llama2
|
3 |
+
app_file: baby_llama2.py
|
|
|
|
|
4 |
sdk: gradio
|
5 |
sdk_version: 3.38.0
|
|
|
|
|
6 |
---
|
7 |
|
8 |
+
## llama2.c
|
9 |
+
|
10 |
+
Have you ever wanted to inference a baby [Llama 2](https://ai.meta.com/llama/) model in pure C? No? Well, now you can!
|
11 |
+
|
12 |
+
<img src="assets/llama_cute.jpg" width="300" height="300">
|
13 |
+
|
14 |
+
With this code you can train the Llama 2 LLM architecture from scratch in PyTorch, then save the weights to a raw binary file, then load that into one ~simple 500-line C file ([run.c](run.c)) that inferences the model, simply in fp32 for now. On my cloud Linux devbox a dim 288 6-layer 6-head model (~15M params) inferences at ~100 tok/s in fp32, and about the same on my M1 MacBook Air. I was somewhat pleasantly surprised that one can run reasonably sized models (few ten million params) at highly interactive rates with an approach this simple.
|
15 |
+
|
16 |
+
Please note that this is just a weekend project: I took nanoGPT, tuned it to implement the Llama-2 architecture instead of GPT-2, and the meat of it was writing the C inference engine in [run.c](run.c). As such, this is not really meant to be a production-grade library right now.
|
17 |
+
|
18 |
+
Hat tip to [llama.cpp](https://github.com/ggerganov/llama.cpp) for inspiring this project. I wanted something super minimal so I chose to hard-code the llama-2 architecture, stick to fp32, and just roll one inference file of pure C with no dependencies.
|
19 |
+
|
20 |
+
## feel the magic
|
21 |
+
|
22 |
+
Let's just run a baby Llama 2 model in C. You need a model checkpoint. Download this 15M parameter model I trained on the [TinyStories](https://huggingface.co/datasets/roneneldan/TinyStories) dataset (~58MB download) and place it into the default checkpoint directory `out`:
|
23 |
+
|
24 |
+
```bash
|
25 |
+
wget https://karpathy.ai/llama2c/model.bin -P out
|
26 |
+
```
|
27 |
+
|
28 |
+
(if that doesn't work try [google drive](https://drive.google.com/file/d/1aTimLdx3JktDXxcHySNrZJOOk8Vb1qBR/view?usp=share_link)). Compile and run the C code:
|
29 |
+
|
30 |
+
```bash
|
31 |
+
gcc -O3 -o run run.c -lm
|
32 |
+
./run out/model.bin
|
33 |
+
```
|
34 |
+
|
35 |
+
You'll see the text stream a sample. On my M1 MacBook Air this runs at ~100 tokens/s, not bad for super naive fp32 single-threaded C code. See [performance](#performance) for compile flags that can significantly speed this up. Sample output:
|
36 |
+
|
37 |
+
*Once upon a time, there was a boy named Timmy. Timmy loved to play sports with his friends. He was very good at throwing and catching balls. One day, Timmy's mom gave him a new shirt to wear to a party. Timmy thought it was impressive and asked his mom to explain what a shirt could be for. "A shirt is like a special suit for a basketball game," his mom said. Timmy was happy to hear that and put on his new shirt. He felt like a soldier going to the army and shouting. From that day on, Timmy wore his new shirt every time he played sports with his friends at the party. Once upon a time, there was a little girl named Lily. She loved to play outside with her friends. One day, Lily and her friend Emma were playing with a ball. Emma threw the ball too hard and it hit Lily's face. Lily felt embarrassed and didn't want to play anymore.
|
38 |
+
Emma asked Lily what was wrong, and Lily told her about her memory. Emma told Lily that she was embarrassed because she had thrown the ball too hard. Lily felt bad
|
39 |
+
achieved tok/s: 98.746993347843922*
|
40 |
+
|
41 |
+
**Update**: I've now also uploaded a bigger checkpoint. This one is dim 512, 8 layers, 8 heads and context length 1024, a ~44M param Transformer. It trained for 200K iterations batch size 32 on 4XA100 40GB GPUs in ~8 hours. You can use this bigger and more powerful checkpoint like so:
|
42 |
+
|
43 |
+
```bash
|
44 |
+
wget https://karpathy.ai/llama2c/model44m.bin -P out44m
|
45 |
+
./run out44m/model44m.bin
|
46 |
+
```
|
47 |
+
|
48 |
+
On my MacBook Air compiled with $ gcc -Ofast -o run run.c -lm this ran at ~150 tok/s. Still way too fast! I have to train an even bigger checkpoint... This model samples more coherent and diverse stories:
|
49 |
+
|
50 |
+
*Once upon a time, there was a little girl named Lily. She loved playing with her toys on top of her bed. One day, she decided to have a tea party with her stuffed animals. She poured some tea into a tiny teapot and put it on top of the teapot. Suddenly, her little brother Max came into the room and wanted to join the tea party too. Lily didn't want to share her tea and she told Max to go away. Max started to cry and Lily felt bad. She decided to yield her tea party to Max and they both shared the teapot. But then, something unexpected happened. The teapot started to shake and wiggle. Lily and Max were scared and didn't know what to do. Suddenly, the teapot started to fly towards the ceiling and landed on the top of the bed. Lily and Max were amazed and they hugged each other. They realized that sharing was much more fun than being selfish. From that day on, they always shared their tea parties and toys.*
|
51 |
+
|
52 |
+
## howto
|
53 |
+
|
54 |
+
It should be possible to load the weights released by Meta but I haven't tried because the inference speed, even of the 7B model, would probably be not great with this baby single-threaded C program. So in this repo we focus on more narrow applications, and train the same architecture but from scratch, in this case on the TinyStories dataset for fun.
|
55 |
+
|
56 |
+
First let's download and pretokenize some source dataset, e.g. I like [TinyStories](https://huggingface.co/datasets/roneneldan/TinyStories) so this is the only example currently available in this repo. But it should be very easy to add datasets, see the code.
|
57 |
+
|
58 |
+
```bash
|
59 |
+
python tinystories.py download
|
60 |
+
python tinystories.py pretokenize
|
61 |
+
```
|
62 |
+
|
63 |
+
Then train our model:
|
64 |
+
|
65 |
+
```bash
|
66 |
+
python train.py
|
67 |
+
```
|
68 |
+
|
69 |
+
See the train.py script for more exotic launches and hyperparameter overrides. I didn't tune the hyperparameters, I expect simple hyperparameter exploration should give better models. Totally understand if you want to skip model training, for simple demo just download my pretrained model and save it into the directory `out`:
|
70 |
+
|
71 |
+
```bash
|
72 |
+
wget https://karpathy.ai/llama2c/model.bin -P out
|
73 |
+
```
|
74 |
+
|
75 |
+
Once we have the model.bin file, we can inference in C. Compile the C code first:
|
76 |
+
|
77 |
+
```bash
|
78 |
+
gcc -O3 -o run run.c -lm
|
79 |
+
```
|
80 |
+
|
81 |
+
You can now run it simply as
|
82 |
+
|
83 |
+
```bash
|
84 |
+
./run out/model.bin
|
85 |
+
```
|
86 |
+
|
87 |
+
Watch the tokens stream by, fun! We can also run the PyTorch inference script for comparison (to run, add [model.ckpt](https://drive.google.com/file/d/1SM0rMxzy7babB-v4MfTg1GFqOCgWar5w/view?usp=share_link) to /out if you haven't already):
|
88 |
+
|
89 |
+
```bash
|
90 |
+
python sample.py
|
91 |
+
```
|
92 |
+
|
93 |
+
Which gives the same results. More detailed testing will be done in `test_all.py`, run as:
|
94 |
+
|
95 |
+
```bash
|
96 |
+
$ pytest
|
97 |
+
```
|
98 |
+
|
99 |
+
Currently you will need two files to test or sample: the [model.bin](https://drive.google.com/file/d/1aTimLdx3JktDXxcHySNrZJOOk8Vb1qBR/view?usp=share_link) file and the [model.ckpt](https://drive.google.com/file/d/1SM0rMxzy7babB-v4MfTg1GFqOCgWar5w/view?usp=share_link) file from PyTorch training I ran earlier. I have to think through running the tests without having to download 200MB of data.
|
100 |
+
|
101 |
+
## performance
|
102 |
+
|
103 |
+
*(NOTE: this guide is not great because I personally spend a lot of my time in Python land and don't have an amazing understanding of a lot of these features and flags. If someone does and is willing to help document and briefly describe some of these and their tradeoffs, I'd welcome a PR)*
|
104 |
+
|
105 |
+
There are many ways to potentially speed up this code depending on your system. Here we document a few together with a high-level guide on what they do. Here's again the default way to compile, but using -O3:
|
106 |
+
|
107 |
+
```bash
|
108 |
+
gcc -O3 -o run run.c -lm
|
109 |
+
```
|
110 |
+
|
111 |
+
-O3 includes optimizations that are expensive in terms of compile time and memory usage. Including vectorization, loop unrolling, and predicting branches. Here's a few more to try.
|
112 |
+
|
113 |
+
`-Ofast` Run additional optimizations which may break compliance with the C/IEEE specifications, in addition to `-O3`. See [the GCC docs](https://gcc.gnu.org/onlinedocs/gcc/Optimize-Options.html) for more information.
|
114 |
+
|
115 |
+
`-ffast-math` breaks IEEE compliance, e.g. allowing reordering of operations, disables a bunch of checks for e.g. NaNs (assuming they don't happen), enables reciprocal approximations, disables signed zero, etc. However, there is a good reason to be suspicious of this setting, one good writeup is here: ["Beware of fast-math"](https://simonbyrne.github.io/notes/fastmath/).
|
116 |
+
|
117 |
+
`-funsafe-math-optimizations` a more limited form of -ffast-math, that still breaks IEEE compliance but doesn't have all of the numeric/error handling changes from `-ffast-math`. See [the GCC docs](https://gcc.gnu.org/wiki/FloatingPointMath) for more information.
|
118 |
+
|
119 |
+
`-march=native` Compile the program to use the architecture of the machine you're compiling on rather than a more generic CPU. This may enable additional optimizations and hardware-specific tuning such as improved vector instructions/width.
|
120 |
+
|
121 |
+
Putting a few of these together, the fastest throughput I saw so far on my MacBook Air (M1) is with:
|
122 |
+
|
123 |
+
```bash
|
124 |
+
gcc -Ofast -o run run.c -lm
|
125 |
+
```
|
126 |
+
|
127 |
+
Also, I saw someone report higher throughput replacing `gcc` with `clang`.
|
128 |
+
|
129 |
+
**OpenMP** Big improvements can also be achieved by compiling with OpenMP, which "activates" the `#pragma omp parallel for` inside the matmul. You can compile e.g. like so:
|
130 |
+
|
131 |
+
```bash
|
132 |
+
clang -Ofast -fopenmp -march=native run.c -lm -o run
|
133 |
+
```
|
134 |
+
|
135 |
+
(I believe you can swap clang/gcc, and may try to leave out -march=native). Then when you run inference, make sure to use OpenMP flags to set the number of threads, e.g.:
|
136 |
+
|
137 |
+
```bash
|
138 |
+
OMP_NUM_THREADS=4 ./run out/model.bin
|
139 |
+
```
|
140 |
+
|
141 |
+
Depending on your system resources you may want to tweak these hyperparameters. (TODO: I am not intimately familiar with OpenMP and its configuration, if someone would like to flesh out this section I would welcome a PR).
|
142 |
+
|
143 |
+
## unsorted todos
|
144 |
+
|
145 |
+
- why is there a leading space in C sampling code when we `./run`?
|
146 |
+
- todo multiquery support? doesn't seem as useful for smaller models that run on CPU (?)
|
147 |
+
- todo support inferencing beyond max_seq_len steps, have to think through the kv cache
|
148 |
+
- why is MFU so low (~10%) on my A100 40GB for training?
|
149 |
+
- weird errors with torch.compile and wandb when using DDP
|
150 |
+
- make more better tests to decrease yolo
|
151 |
+
|
152 |
+
## ack
|
153 |
+
|
154 |
+
I trained the llama2.c storyteller models on a 4X A100 40GB box graciously provided by the excellent [Lambda labs](https://lambdalabs.com/service/gpu-cloud), thank you.
|
155 |
+
|
156 |
+
## License
|
157 |
+
|
158 |
+
MIT
|
baby_llama2.py
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import subprocess
import gradio as gr
from translate import Translator

def generate_text():
    """Run the compiled llama2.c binary and return its output translated to Chinese.

    Returns:
        str: the generated story, translated sentence-by-sentence.
    Raises:
        subprocess.CalledProcessError: if ./run is missing or exits non-zero
        (check=True makes the failure loud instead of silently returning '').
    """
    cmd = ['./run', 'model.bin']
    result = subprocess.run(cmd, stdout=subprocess.PIPE, text=True, check=True)
    translator = Translator(from_lang='en', to_lang='zh-cn')

    # Split the output into sentences; the translate backend limits request
    # length, so translating sentence-by-sentence avoids truncation.
    sentences = result.stdout.split('. ')

    # Translate each sentence and join them back together
    translation = '. '.join(translator.translate(sentence) for sentence in sentences)

    return translation

iface = gr.Interface(
    fn=generate_text,
    inputs=[],
    outputs="text",
    submit_label="开始生成",
    title="和小羊驼一起玩"
)
iface.launch()
|
configurator.py
ADDED
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Poor Man's Configurator. Probably a terrible idea. Example usage:
|
3 |
+
$ python train.py config/override_file.py --batch_size=32
|
4 |
+
this will first run config/override_file.py, then override batch_size to 32
|
5 |
+
|
6 |
+
The code in this file will be run as follows from e.g. train.py:
|
7 |
+
>>> exec(open('configurator.py').read())
|
8 |
+
|
9 |
+
So it's not a Python module, it's just shuttling this code away from train.py
|
10 |
+
The code in this script then overrides the globals()
|
11 |
+
|
12 |
+
I know people are not going to love this, I just really dislike configuration
|
13 |
+
complexity and having to prepend config. to every single variable. If someone
|
14 |
+
comes up with a better simple Python solution I am all ears.
|
15 |
+
"""
|
16 |
+
|
17 |
+
import sys
|
18 |
+
from ast import literal_eval
|
19 |
+
|
20 |
+
for arg in sys.argv[1:]:
|
21 |
+
if '=' not in arg:
|
22 |
+
# assume it's the name of a config file
|
23 |
+
assert not arg.startswith('--')
|
24 |
+
config_file = arg
|
25 |
+
print(f"Overriding config with {config_file}:")
|
26 |
+
with open(config_file) as f:
|
27 |
+
print(f.read())
|
28 |
+
exec(open(config_file).read())
|
29 |
+
else:
|
30 |
+
# assume it's a --key=value argument
|
31 |
+
assert arg.startswith('--')
|
32 |
+
key, val = arg.split('=')
|
33 |
+
key = key[2:]
|
34 |
+
if key in globals():
|
35 |
+
try:
|
36 |
+
# attempt to eval it it (e.g. if bool, number, or etc)
|
37 |
+
attempt = literal_eval(val)
|
38 |
+
except (SyntaxError, ValueError):
|
39 |
+
# if that goes wrong, just use the string
|
40 |
+
attempt = val
|
41 |
+
# ensure the types match ok
|
42 |
+
assert type(attempt) == type(globals()[key])
|
43 |
+
# cross fingers
|
44 |
+
print(f"Overriding: {key} = {attempt}")
|
45 |
+
globals()[key] = attempt
|
46 |
+
else:
|
47 |
+
raise ValueError(f"Unknown config key: {key}")
|
import subprocess.py
ADDED
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import subprocess

# Define the command as a list
import gradio as gr


def generate_text():
    """Run the compiled llama2.c binary and return the generated story.

    Returns:
        str: the binary's stdout. Returning the CompletedProcess object
        (as the original did) would make gradio render its repr instead
        of the story text.
    Raises:
        subprocess.CalledProcessError: if ./run is missing or fails.
    """
    cmd = ['./run', 'model.bin']

    # Use subprocess.run to execute the command; check=True fails loudly
    result = subprocess.run(cmd, stdout=subprocess.PIPE, text=True, check=True)
    return result.stdout


iface = gr.Interface(fn=generate_text, inputs=[], outputs="text", submit_label="让小羊驼跑起来", title="和小羊驼一起玩")
iface.launch()


# NOTE: iface.launch() blocks, so this only runs after the gradio server
# shuts down — one extra local run, printed to the console.
cmd = ['./run', 'model.bin']

# Use subprocess.run to execute the command
result = subprocess.run(cmd, stdout=subprocess.PIPE, text=True)

# Print the output
print(result.stdout)
|
model.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:cd590644d963867a2b6e5a1107f51fad663c41d79c149fbecbbb1f95fa81f49a
|
3 |
+
size 60816028
|
model.py
ADDED
@@ -0,0 +1,370 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
2 |
+
# This software may be used and distributed according to the terms of the Llama 2 Community License Agreement.
|
3 |
+
|
4 |
+
import math
|
5 |
+
import struct
|
6 |
+
import inspect
|
7 |
+
from dataclasses import dataclass
|
8 |
+
from typing import Any, Optional, Tuple
|
9 |
+
|
10 |
+
import numpy as np
|
11 |
+
import torch
|
12 |
+
import torch.nn.functional as F
|
13 |
+
from torch import nn
|
14 |
+
|
15 |
+
@dataclass
class ModelArgs:
    """Architecture hyperparameters for the Transformer."""
    dim: int = 4096  # embedding / residual-stream dimension
    n_layers: int = 32  # number of TransformerBlock layers
    n_heads: int = 32  # number of query heads
    n_kv_heads: Optional[int] = None  # key/value heads; None -> same as n_heads
    vocab_size: int = -1  # defined later by tokenizer
    multiple_of: int = 256  # make SwiGLU hidden layer size multiple of large power of 2
    norm_eps: float = 1e-5  # epsilon inside RMSNorm
    max_seq_len: int = 2048  # maximum sequence length supported
    dropout: float = 0.0  # dropout prob used throughout the model
|
26 |
+
|
27 |
+
|
28 |
+
class RMSNorm(torch.nn.Module):
    """Root-mean-square layer norm: rescale by 1/RMS(x) and a learned gain."""

    def __init__(self, dim: int, eps: float):
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(dim))

    def _norm(self, x):
        # divide by the RMS over the last dimension; eps guards against /0
        mean_sq = x.pow(2).mean(dim=-1, keepdim=True)
        return x * torch.rsqrt(mean_sq + self.eps)

    def forward(self, x):
        # normalize in float32 for numerical stability, cast back to input dtype
        normalized = self._norm(x.float()).type_as(x)
        return normalized * self.weight
|
40 |
+
|
41 |
+
|
42 |
+
def precompute_freqs_cis(dim: int, end: int, theta: float = 10000.0):
    """Precompute the complex RoPE rotation factors.

    Returns a complex64 tensor of shape (end, dim // 2) whose entry
    (t, i) is exp(j * t * theta^(-2i/dim)).
    """
    half = dim // 2
    inv_freq = theta ** (-torch.arange(0, dim, 2)[:half].float() / dim)
    positions = torch.arange(end, device=inv_freq.device)  # type: ignore
    angles = torch.outer(positions, inv_freq).float()  # type: ignore
    # unit-magnitude complex numbers carrying the per-position phases
    return torch.polar(torch.ones_like(angles), angles)
|
48 |
+
|
49 |
+
|
50 |
+
def reshape_for_broadcast(freqs_cis: torch.Tensor, x: torch.Tensor):
    """Reshape freqs_cis (seqlen, head_dim) so it broadcasts against x.

    The result keeps x's rank, with size 1 everywhere except the sequence
    axis (dim 1) and the last axis, which must match freqs_cis exactly.
    """
    ndim = x.ndim
    assert 0 <= 1 < ndim
    assert freqs_cis.shape == (x.shape[1], x.shape[-1])
    broadcast_shape = [
        size if axis in (1, ndim - 1) else 1
        for axis, size in enumerate(x.shape)
    ]
    return freqs_cis.view(*broadcast_shape)
|
56 |
+
|
57 |
+
|
58 |
+
def apply_rotary_emb(
    xq: torch.Tensor,
    xk: torch.Tensor,
    freqs_cis: torch.Tensor,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """Rotate query/key vectors by the precomputed RoPE phases.

    Consecutive pairs along the last axis are viewed as complex numbers,
    multiplied by freqs_cis (a 2D rotation per pair), then unpacked back
    into real tensors of the original dtype.
    """
    # (..., head_dim) -> (..., head_dim/2) complex
    q_complex = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))
    k_complex = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))
    phases = reshape_for_broadcast(freqs_cis, q_complex)
    # complex multiply applies the rotation; flatten restores head_dim
    q_rotated = torch.view_as_real(q_complex * phases).flatten(3)
    k_rotated = torch.view_as_real(k_complex * phases).flatten(3)
    return q_rotated.type_as(xq), k_rotated.type_as(xk)
|
69 |
+
|
70 |
+
def repeat_kv(x: torch.Tensor, n_rep: int) -> torch.Tensor:
    """torch.repeat_interleave(x, dim=2, repeats=n_rep)"""
    batch, seqlen, n_kv_heads, head_dim = x.shape
    if n_rep == 1:
        # nothing to expand; return the input untouched
        return x
    # insert a new axis, broadcast-expand it, then fold it into the head axis
    expanded = x[:, :, :, None, :].expand(batch, seqlen, n_kv_heads, n_rep, head_dim)
    return expanded.reshape(batch, seqlen, n_kv_heads * n_rep, head_dim)
|
80 |
+
|
81 |
+
class Attention(nn.Module):
    """Causal self-attention with RoPE and grouped multiquery support
    (n_kv_heads may be smaller than n_heads; kv heads are then shared)."""

    def __init__(self, args: ModelArgs):
        super().__init__()
        # fall back to full multi-head attention when n_kv_heads is unset
        self.n_kv_heads = args.n_heads if args.n_kv_heads is None else args.n_kv_heads
        model_parallel_size = 1  # no model parallelism in this repo
        self.n_local_heads = args.n_heads // model_parallel_size
        self.n_local_kv_heads = self.n_kv_heads // model_parallel_size
        # how many query heads share each key/value head
        self.n_rep = self.n_local_heads // self.n_local_kv_heads
        self.head_dim = args.dim // args.n_heads
        self.wq = nn.Linear(args.dim, args.n_heads * self.head_dim, bias=False)
        self.wk = nn.Linear(args.dim, self.n_kv_heads * self.head_dim, bias=False)
        self.wv = nn.Linear(args.dim, self.n_kv_heads * self.head_dim, bias=False)
        self.wo = nn.Linear(args.n_heads * self.head_dim, args.dim, bias=False)
        self.attn_dropout = nn.Dropout(args.dropout)
        self.resid_dropout = nn.Dropout(args.dropout)
        self.dropout = args.dropout

        # use flash attention or a manual implementation?
        self.flash = hasattr(torch.nn.functional, 'scaled_dot_product_attention')
        if not self.flash:
            print("WARNING: using slow attention. Flash Attention requires PyTorch >= 2.0")
            # causal mask: -inf strictly above the diagonal so softmax
            # assigns zero weight to future positions
            mask = torch.full((1, 1, args.max_seq_len, args.max_seq_len), float("-inf"))
            mask = torch.triu(mask, diagonal=1)
            self.register_buffer("mask", mask)

    def forward(
        self,
        x: torch.Tensor,
        freqs_cis: torch.Tensor,
    ):
        """Attend over x of shape (bsz, seqlen, dim); returns the same shape."""
        bsz, seqlen, _ = x.shape

        # QKV
        xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)
        xq = xq.view(bsz, seqlen, self.n_local_heads, self.head_dim)
        xk = xk.view(bsz, seqlen, self.n_local_kv_heads, self.head_dim)
        xv = xv.view(bsz, seqlen, self.n_local_kv_heads, self.head_dim)

        # RoPE relative positional embeddings
        xq, xk = apply_rotary_emb(xq, xk, freqs_cis)

        # grouped multiquery attention: expand out keys and values
        xk = repeat_kv(xk, self.n_rep)  # (bs, seqlen, n_local_heads, head_dim)
        xv = repeat_kv(xv, self.n_rep)  # (bs, seqlen, n_local_heads, head_dim)

        # make heads into a batch dimension
        xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)
        xk = xk.transpose(1, 2)
        xv = xv.transpose(1, 2)

        # flash implementation
        if self.flash:
            output = torch.nn.functional.scaled_dot_product_attention(xq, xk, xv, attn_mask=None, dropout_p=self.dropout if self.training else 0, is_causal=True)
        else:
            # manual implementation
            scores = torch.matmul(xq, xk.transpose(2, 3)) / math.sqrt(self.head_dim)
            scores = scores + self.mask[:, :, :seqlen, :seqlen]  # (bs, n_local_heads, seqlen, cache_len + seqlen)
            # softmax in float32 for numerical stability, then back to input dtype
            scores = F.softmax(scores.float(), dim=-1).type_as(xq)
            scores = self.attn_dropout(scores)
            output = torch.matmul(scores, xv)  # (bs, n_local_heads, seqlen, head_dim)

        # restore time as batch dimension and concat heads
        output = output.transpose(1, 2).contiguous().view(bsz, seqlen, -1)

        # final projection into the residual stream
        output = self.wo(output)
        output = self.resid_dropout(output)
        return output
|
149 |
+
|
150 |
+
|
151 |
+
class FeedForward(nn.Module):
    """SwiGLU feed-forward block: w2(silu(w1(x)) * w3(x)) with dropout."""

    def __init__(self, dim: int, hidden_dim: int, multiple_of: int, dropout: float):
        super().__init__()
        # shrink the 4*dim hidden size by 2/3 (SwiGLU convention),
        # then round UP to the nearest multiple of `multiple_of`
        hidden_dim = int(2 * hidden_dim / 3)
        hidden_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)
        self.w1 = nn.Linear(dim, hidden_dim, bias=False)
        self.w2 = nn.Linear(hidden_dim, dim, bias=False)
        self.w3 = nn.Linear(dim, hidden_dim, bias=False)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        gated = F.silu(self.w1(x)) * self.w3(x)
        return self.dropout(self.w2(gated))
|
163 |
+
|
164 |
+
|
165 |
+
class TransformerBlock(nn.Module):
    """One pre-norm Transformer layer: attention and a SwiGLU MLP, each
    preceded by RMSNorm and wrapped in a residual connection."""

    def __init__(self, layer_id: int, args: ModelArgs):
        super().__init__()
        self.n_heads = args.n_heads
        self.dim = args.dim
        self.head_dim = args.dim // args.n_heads
        self.attention = Attention(args)
        self.feed_forward = FeedForward(
            dim=args.dim,
            hidden_dim=4 * args.dim,
            multiple_of=args.multiple_of,
            dropout=args.dropout,
        )
        self.layer_id = layer_id
        self.attention_norm = RMSNorm(args.dim, eps=args.norm_eps)
        self.ffn_norm = RMSNorm(args.dim, eps=args.norm_eps)

    def forward(self, x, freqs_cis):
        # residual around attention, then residual around the MLP
        attn_out = self.attention.forward(self.attention_norm(x), freqs_cis)
        h = x + attn_out
        return h + self.feed_forward.forward(self.ffn_norm(h))
|
186 |
+
|
187 |
+
|
188 |
+
class Transformer(nn.Module):
    """Llama-2-style decoder-only Transformer language model.

    forward() returns (logits, loss); generate() samples autoregressively;
    export() serializes the fp32 weights into the .bin format read by run.c.
    """

    def __init__(self, params: ModelArgs):
        super().__init__()
        self.params = params
        self.vocab_size = params.vocab_size
        self.n_layers = params.n_layers

        self.tok_embeddings = nn.Embedding(params.vocab_size, params.dim)
        self.dropout = nn.Dropout(params.dropout)
        self.layers = torch.nn.ModuleList()
        for layer_id in range(params.n_layers):
            self.layers.append(TransformerBlock(layer_id, params))
        self.norm = RMSNorm(params.dim, eps=params.norm_eps)
        self.output = nn.Linear(params.dim, params.vocab_size, bias=False)

        # share the unembedding parameters with the embedding parameters
        self.tok_embeddings.weight = self.output.weight # https://paperswithcode.com/method/weight-tying

        # some useful precompute for the RoPE relative positional embeddings. TODO why * 2 here? confuse
        freqs_cis = precompute_freqs_cis(self.params.dim // self.params.n_heads, self.params.max_seq_len * 2)
        self.register_buffer("freqs_cis", freqs_cis, persistent=False)

        # init all weights
        self.apply(self._init_weights)
        # apply special scaled init to the residual projections, per GPT-2 paper
        for pn, p in self.named_parameters():
            if pn.endswith('w3.weight') or pn.endswith('wo.weight'):
                torch.nn.init.normal_(p, mean=0.0, std=0.02/math.sqrt(2 * params.n_layers))

    def _init_weights(self, module):
        """Default init: N(0, 0.02) for linear/embedding weights, zeros for biases."""
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, tokens, targets=None):
        """Forward token ids (bsz, seqlen) through the model.

        With targets, logits covers every position and loss is cross entropy
        (target -1 is ignored); without targets, logits covers only the last
        position and loss is None. Returns (logits, loss).
        """
        _bsz, seqlen = tokens.shape
        h = self.tok_embeddings(tokens)
        h = self.dropout(h)
        # slice the precomputed RoPE table to the actual sequence length
        freqs_cis = self.freqs_cis[:seqlen]

        for layer in self.layers:
            h = layer(h, freqs_cis)
        h = self.norm(h)

        if targets is not None:
            # if we are given some desired targets also calculate the loss
            logits = self.output(h)
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1)
        else:
            # inference-time mini-optimization: only forward the output on the very last position
            logits = self.output(h[:, [-1], :]) # note: using list [-1] to preserve the time dim
            loss = None

        return logits, loss

    def configure_optimizers(self, weight_decay, learning_rate, betas, device_type):
        """Build an AdamW optimizer, applying weight decay only to >=2D tensors."""
        # start with all of the candidate parameters
        param_dict = {pn: p for pn, p in self.named_parameters()}
        # filter out those that do not require grad
        param_dict = {pn: p for pn, p in param_dict.items() if p.requires_grad}
        # create optim groups. Any parameters that is 2D will be weight decayed, otherwise no.
        # i.e. all weight tensors in matmuls + embeddings decay, all biases and layernorms don't.
        decay_params = [p for n, p in param_dict.items() if p.dim() >= 2]
        nodecay_params = [p for n, p in param_dict.items() if p.dim() < 2]
        optim_groups = [
            {'params': decay_params, 'weight_decay': weight_decay},
            {'params': nodecay_params, 'weight_decay': 0.0}
        ]
        num_decay_params = sum(p.numel() for p in decay_params)
        num_nodecay_params = sum(p.numel() for p in nodecay_params)
        print(f"num decayed parameter tensors: {len(decay_params)}, with {num_decay_params:,} parameters")
        print(f"num non-decayed parameter tensors: {len(nodecay_params)}, with {num_nodecay_params:,} parameters")
        # Create AdamW optimizer and use the fused version if it is available
        fused_available = 'fused' in inspect.signature(torch.optim.AdamW).parameters
        use_fused = fused_available and device_type == 'cuda'
        extra_args = dict(fused=True) if use_fused else dict()
        optimizer = torch.optim.AdamW(optim_groups, lr=learning_rate, betas=betas, **extra_args)
        print(f"using fused AdamW: {use_fused}")

        return optimizer

    def estimate_mfu(self, fwdbwd_per_iter, dt):
        """ estimate model flops utilization (MFU) in units of A100 bfloat16 peak FLOPS """
        # first estimate the number of flops we do per iteration.
        # see PaLM paper Appendix B as ref: https://arxiv.org/abs/2204.02311
        N = sum(p.numel() for p in self.parameters())
        cfg = self.params
        L, H, Q, T = cfg.n_layers, cfg.n_heads, cfg.dim//cfg.n_heads, cfg.max_seq_len
        flops_per_token = 6*N + 12*L*H*Q*T
        flops_per_fwdbwd = flops_per_token * T
        flops_per_iter = flops_per_fwdbwd * fwdbwd_per_iter
        # express our flops throughput as ratio of A100 bfloat16 peak flops
        flops_achieved = flops_per_iter * (1.0/dt) # per second
        flops_promised = 312e12 # A100 GPU bfloat16 peak flops is 312 TFLOPS
        mfu = flops_achieved / flops_promised
        return mfu

    @torch.inference_mode()
    def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None):
        """
        Take a conditioning sequence of indices idx (LongTensor of shape (b,t)) and complete
        the sequence max_new_tokens times, feeding the predictions back into the model each time.
        Most likely you'll want to make sure to be in model.eval() mode of operation for this.
        Also note this is a super inefficient version of sampling with no key/value cache.
        """
        for _ in range(max_new_tokens):
            # if the sequence context is growing too long we must crop it at block_size
            idx_cond = idx if idx.size(1) <= self.params.max_seq_len else idx[:, -self.params.max_seq_len:]
            # forward the model to get the logits for the index in the sequence
            logits, _ = self(idx_cond)
            logits = logits[:, -1, :] # crop to just the final time step
            if temperature == 0.0:
                # "sample" the single most likely index
                _, idx_next = torch.topk(logits, k=1, dim=-1)
            else:
                # pluck the logits at the final step and scale by desired temperature
                logits = logits / temperature
                # optionally crop the logits to only the top k options
                if top_k is not None:
                    v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
                    logits[logits < v[:, [-1]]] = -float('Inf')
                # apply softmax to convert logits to (normalized) probabilities
                probs = F.softmax(logits, dim=-1)
                idx_next = torch.multinomial(probs, num_samples=1)
            # append sampled index to the running sequence and continue
            idx = torch.cat((idx, idx_next), dim=1)

        return idx

    def export(self, filepath='model.bin'):
        """export the model weights in fp32 into .bin file to be read from C"""
        f = open(filepath, 'wb')

        def serialize(t):
            # flatten to fp32 on CPU and write raw packed floats
            d = t.detach().cpu().view(-1).numpy().astype(np.float32)
            b = struct.pack(f'{len(d)}f', *d)
            f.write(b)

        # first write out the header
        hidden_dim = self.layers[0].feed_forward.w1.weight.shape[0]
        p = self.params
        n_kv_heads = p.n_heads if p.n_kv_heads is None else p.n_kv_heads
        header = struct.pack('iiiiiii', p.dim, hidden_dim, p.n_layers, p.n_heads,
                                       n_kv_heads, p.vocab_size, p.max_seq_len)
        f.write(header)

        # next write out the embedding weights
        serialize(self.tok_embeddings.weight)

        # now all the layers
        # attention weights
        for layer in self.layers:
            serialize(layer.attention_norm.weight)
        for layer in self.layers:
            serialize(layer.attention.wq.weight)
        for layer in self.layers:
            serialize(layer.attention.wk.weight)
        for layer in self.layers:
            serialize(layer.attention.wv.weight)
        for layer in self.layers:
            serialize(layer.attention.wo.weight)
        # ffn weights
        for layer in self.layers:
            serialize(layer.ffn_norm.weight)
        for layer in self.layers:
            serialize(layer.feed_forward.w1.weight)
        for layer in self.layers:
            serialize(layer.feed_forward.w2.weight)
        for layer in self.layers:
            serialize(layer.feed_forward.w3.weight)
        # final rmsnorm
        serialize(self.norm.weight)
        # note: no need to write final classifier weights due to weight sharing
        # freqs_cis
        serialize(self.freqs_cis.real[:p.max_seq_len])
        serialize(self.freqs_cis.imag[:p.max_seq_len])

        # write to binary file
        f.close()
        print(f"wrote {filepath}")
|
requirements.txt
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
numpy==1.23.5
|
2 |
+
pytest==7.4.0
|
3 |
+
Requests==2.31.0
|
4 |
+
sentencepiece==0.1.99
|
5 |
+
tiktoken==0.3.3
|
6 |
+
torch==2.0.1
|
7 |
+
tqdm==4.64.1
|
8 |
+
wandb==0.15.5
|
run
ADDED
Binary file (29.7 kB). View file
|
|
run.c
ADDED
@@ -0,0 +1,490 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
/*
|
2 |
+
Inference for Llama-2 Transformer model in pure C.
|
3 |
+
|
4 |
+
Example compile: (see README for more details)
|
5 |
+
$ gcc -O3 -o run run.c -lm
|
6 |
+
|
7 |
+
Then run with:
|
8 |
+
$ ./run
|
9 |
+
*/
|
10 |
+
|
11 |
+
#include <stdio.h>
|
12 |
+
#include <stdlib.h>
|
13 |
+
#include <time.h>
|
14 |
+
#include <math.h>
|
15 |
+
#include <string.h>
|
16 |
+
#include <sys/time.h>
|
17 |
+
|
18 |
+
// ----------------------------------------------------------------------------
|
19 |
+
// Transformer and RunState structs, and related memory management
|
20 |
+
|
21 |
+
typedef struct {
|
22 |
+
int dim; // transformer dimension
|
23 |
+
int hidden_dim; // for ffn layers
|
24 |
+
int n_layers; // number of layers
|
25 |
+
int n_heads; // number of query heads
|
26 |
+
int n_kv_heads; // number of key/value heads (can be < query heads because of multiquery)
|
27 |
+
int vocab_size; // vocabulary size, usually 256 (byte-level)
|
28 |
+
int seq_len; // max sequence length
|
29 |
+
} Config;
|
30 |
+
|
31 |
+
typedef struct {
|
32 |
+
// token embedding table
|
33 |
+
float* token_embedding_table; // (vocab_size, dim)
|
34 |
+
// weights for rmsnorms
|
35 |
+
float* rms_att_weight; // (layer, dim) rmsnorm weights
|
36 |
+
float* rms_ffn_weight; // (layer, dim)
|
37 |
+
// weights for matmuls
|
38 |
+
float* wq; // (layer, dim, dim)
|
39 |
+
float* wk; // (layer, dim, dim)
|
40 |
+
float* wv; // (layer, dim, dim)
|
41 |
+
float* wo; // (layer, dim, dim)
|
42 |
+
// weights for ffn
|
43 |
+
float* w1; // (layer, hidden_dim, dim)
|
44 |
+
float* w2; // (layer, dim, hidden_dim)
|
45 |
+
float* w3; // (layer, hidden_dim, dim)
|
46 |
+
// final rmsnorm
|
47 |
+
float* rms_final_weight; // (dim,)
|
48 |
+
// freq_cis for RoPE relatively positional embeddings
|
49 |
+
float* freq_cis_real; // (seq_len, dim/2)
|
50 |
+
float* freq_cis_imag; // (seq_len, dim/2)
|
51 |
+
} TransformerWeights;
|
52 |
+
|
53 |
+
typedef struct {
|
54 |
+
// current wave of activations
|
55 |
+
float *x; // activation at current time stamp (dim,)
|
56 |
+
float *xb; // same, but inside a residual branch (dim,)
|
57 |
+
float *xb2; // an additional buffer just for convenience (dim,)
|
58 |
+
float *hb; // buffer for hidden dimension in the ffn (hidden_dim,)
|
59 |
+
float *hb2; // buffer for hidden dimension in the ffn (hidden_dim,)
|
60 |
+
float *q; // query (dim,)
|
61 |
+
float *k; // key (dim,)
|
62 |
+
float *v; // value (dim,)
|
63 |
+
float *att; // buffer for scores/attention values (seq_len,)
|
64 |
+
float *logits; // output logits
|
65 |
+
// kv cache
|
66 |
+
float* key_cache; // (layer, seq_len, dim)
|
67 |
+
float* value_cache; // (layer, seq_len, dim)
|
68 |
+
} RunState;
|
69 |
+
|
70 |
+
void malloc_run_state(RunState* s, Config* p) {
|
71 |
+
// we calloc instead of malloc to keep valgrind happy
|
72 |
+
s->x = calloc(p->dim, sizeof(float));
|
73 |
+
s->xb = calloc(p->dim, sizeof(float));
|
74 |
+
s->xb2 = calloc(p->dim, sizeof(float));
|
75 |
+
s->hb = calloc(p->hidden_dim, sizeof(float));
|
76 |
+
s->hb2 = calloc(p->hidden_dim, sizeof(float));
|
77 |
+
s->q = calloc(p->dim, sizeof(float));
|
78 |
+
s->k = calloc(p->dim, sizeof(float));
|
79 |
+
s->v = calloc(p->dim, sizeof(float));
|
80 |
+
s->att = calloc(p->seq_len, sizeof(float));
|
81 |
+
s->logits = calloc(p->vocab_size, sizeof(float));
|
82 |
+
s->key_cache = calloc(p->n_layers * p->seq_len * p->dim, sizeof(float));
|
83 |
+
s->value_cache = calloc(p->n_layers * p->seq_len * p->dim, sizeof(float));
|
84 |
+
// ensure all mallocs went fine
|
85 |
+
if (!s->x || !s->xb || !s->xb2 || !s->hb || !s->hb2 || !s->q
|
86 |
+
|| !s->k || !s->v || !s->att || !s->logits || !s->key_cache
|
87 |
+
|| !s->value_cache) {
|
88 |
+
printf("malloc failed!\n");
|
89 |
+
exit(1);
|
90 |
+
}
|
91 |
+
}
|
92 |
+
|
93 |
+
void free_run_state(RunState* s) {
|
94 |
+
free(s->x);
|
95 |
+
free(s->xb);
|
96 |
+
free(s->xb2);
|
97 |
+
free(s->hb);
|
98 |
+
free(s->hb2);
|
99 |
+
free(s->q);
|
100 |
+
free(s->k);
|
101 |
+
free(s->v);
|
102 |
+
free(s->att);
|
103 |
+
free(s->logits);
|
104 |
+
free(s->key_cache);
|
105 |
+
free(s->value_cache);
|
106 |
+
}
|
107 |
+
|
108 |
+
void malloc_weights(TransformerWeights* w, Config* p) {
|
109 |
+
// we calloc instead of malloc to keep valgrind happy
|
110 |
+
w->token_embedding_table = calloc(p->vocab_size * p->dim, sizeof(float));
|
111 |
+
w->rms_att_weight = calloc(p->n_layers * p->dim, sizeof(float));
|
112 |
+
w->rms_ffn_weight = calloc(p->n_layers * p->dim, sizeof(float));
|
113 |
+
w->wq = calloc(p->n_layers * p->dim * p->dim, sizeof(float));
|
114 |
+
w->wk = calloc(p->n_layers * p->dim * p->dim, sizeof(float));
|
115 |
+
w->wv = calloc(p->n_layers * p->dim * p->dim, sizeof(float));
|
116 |
+
w->wo = calloc(p->n_layers * p->dim * p->dim, sizeof(float));
|
117 |
+
w->w1 = calloc(p->n_layers * p->hidden_dim * p->dim, sizeof(float));
|
118 |
+
w->w2 = calloc(p->n_layers * p->dim * p->hidden_dim, sizeof(float));
|
119 |
+
w->w3 = calloc(p->n_layers * p->hidden_dim * p->dim, sizeof(float));
|
120 |
+
w->rms_final_weight = calloc(p->dim, sizeof(float));
|
121 |
+
w->freq_cis_real = calloc(p->seq_len * p->dim / 2, sizeof(float));
|
122 |
+
w->freq_cis_imag = calloc(p->seq_len * p->dim / 2, sizeof(float));
|
123 |
+
// ensure all mallocs went fine
|
124 |
+
if (!w->token_embedding_table || !w->rms_att_weight || !w->rms_ffn_weight
|
125 |
+
|| !w->wq || !w->wk || !w->wv || !w->wo || !w->w1 || !w->w2 || !w->w3 ||
|
126 |
+
!w->rms_final_weight || !w->freq_cis_real || !w->freq_cis_imag) {
|
127 |
+
printf("malloc failed!\n");
|
128 |
+
exit(1);
|
129 |
+
}
|
130 |
+
}
|
131 |
+
|
132 |
+
void free_weights(TransformerWeights* w) {
|
133 |
+
free(w->token_embedding_table);
|
134 |
+
free(w->rms_att_weight);
|
135 |
+
free(w->rms_ffn_weight);
|
136 |
+
free(w->wq);
|
137 |
+
free(w->wk);
|
138 |
+
free(w->wv);
|
139 |
+
free(w->wo);
|
140 |
+
free(w->w1);
|
141 |
+
free(w->w2);
|
142 |
+
free(w->w3);
|
143 |
+
free(w->rms_final_weight);
|
144 |
+
free(w->freq_cis_real);
|
145 |
+
free(w->freq_cis_imag);
|
146 |
+
}
|
147 |
+
|
148 |
+
// ----------------------------------------------------------------------------
|
149 |
+
// initialization: read from checkpoint
|
150 |
+
|
151 |
+
int checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f) {
|
152 |
+
if (fread(w->token_embedding_table, sizeof(float), p->vocab_size * p->dim, f) != p->vocab_size * p->dim) return 1;
|
153 |
+
if (fread(w->rms_att_weight, sizeof(float), p->n_layers * p->dim, f) != p->n_layers * p->dim) return 1;
|
154 |
+
if (fread(w->wq, sizeof(float), p->n_layers * p->dim * p->dim, f) != p->n_layers * p->dim * p->dim) return 1;
|
155 |
+
if (fread(w->wk, sizeof(float), p->n_layers * p->dim * p->dim, f) != p->n_layers * p->dim * p->dim) return 1;
|
156 |
+
if (fread(w->wv, sizeof(float), p->n_layers * p->dim * p->dim, f) != p->n_layers * p->dim * p->dim) return 1;
|
157 |
+
if (fread(w->wo, sizeof(float), p->n_layers * p->dim * p->dim, f) != p->n_layers * p->dim * p->dim) return 1;
|
158 |
+
if (fread(w->rms_ffn_weight, sizeof(float), p->n_layers * p->dim, f) != p->n_layers * p->dim) return 1;
|
159 |
+
if (fread(w->w1, sizeof(float), p->n_layers * p->dim * p->hidden_dim, f) != p->n_layers * p->dim * p->hidden_dim) return 1;
|
160 |
+
if (fread(w->w2, sizeof(float), p->n_layers * p->hidden_dim * p->dim, f) != p->n_layers * p->hidden_dim * p->dim) return 1;
|
161 |
+
if (fread(w->w3, sizeof(float), p->n_layers * p->dim * p->hidden_dim, f) != p->n_layers * p->dim * p->hidden_dim) return 1;
|
162 |
+
if (fread(w->rms_final_weight, sizeof(float), p->dim, f) != p->dim) return 1;
|
163 |
+
int head_size = p->dim / p->n_heads;
|
164 |
+
if (fread(w->freq_cis_real, sizeof(float), p->seq_len * head_size / 2, f) != p->seq_len * head_size / 2) return 1;
|
165 |
+
if (fread(w->freq_cis_imag, sizeof(float), p->seq_len * head_size / 2, f) != p->seq_len * head_size / 2) return 1;
|
166 |
+
return 0;
|
167 |
+
}
|
168 |
+
|
169 |
+
|
170 |
+
// ----------------------------------------------------------------------------
|
171 |
+
// neural net blocks
|
172 |
+
|
173 |
+
void accum(float *a, float *b, int size) {
|
174 |
+
for (int i = 0; i < size; i++) {
|
175 |
+
a[i] += b[i];
|
176 |
+
}
|
177 |
+
}
|
178 |
+
|
179 |
+
void rmsnorm(float* o, float* x, float* weight, int size) {
|
180 |
+
// calculate sum of squares
|
181 |
+
float ss = 0.0f;
|
182 |
+
for (int j = 0; j < size; j++) {
|
183 |
+
ss += x[j] * x[j];
|
184 |
+
}
|
185 |
+
ss /= size;
|
186 |
+
ss += 1e-5f;
|
187 |
+
ss = 1.0f / sqrt(ss);
|
188 |
+
// normalize and scale
|
189 |
+
for (int j = 0; j < size; j++) {
|
190 |
+
o[j] = weight[j] * (ss * x[j]);
|
191 |
+
}
|
192 |
+
}
|
193 |
+
|
194 |
+
void softmax(float* x, int size) {
|
195 |
+
// find max value (for numerical stability)
|
196 |
+
float max_val = x[0];
|
197 |
+
for (int i = 1; i < size; i++) {
|
198 |
+
if (x[i] > max_val) {
|
199 |
+
max_val = x[i];
|
200 |
+
}
|
201 |
+
}
|
202 |
+
// exp and sum
|
203 |
+
float sum = 0.0f;
|
204 |
+
for (int i = 0; i < size; i++) {
|
205 |
+
x[i] = exp(x[i] - max_val);
|
206 |
+
sum += x[i];
|
207 |
+
}
|
208 |
+
// normalize
|
209 |
+
for (int i = 0; i < size; i++) {
|
210 |
+
x[i] /= sum;
|
211 |
+
}
|
212 |
+
}
|
213 |
+
|
214 |
+
void matmul(float* xout, float* x, float* w, int n, int d) {
|
215 |
+
// W (d,n) @ x (n,) -> xout (d,)
|
216 |
+
#pragma omp parallel for
|
217 |
+
for (int i = 0; i < d; i++) {
|
218 |
+
float val = 0.0f;
|
219 |
+
for (int j = 0; j < n; j++) {
|
220 |
+
val += w[i * n + j] * x[j];
|
221 |
+
}
|
222 |
+
xout[i] = val;
|
223 |
+
}
|
224 |
+
}
|
225 |
+
|
226 |
+
void transformer(int token, int pos, Config* p, RunState* s, TransformerWeights* w) {
|
227 |
+
|
228 |
+
// a few convenience variables
|
229 |
+
float *x = s->x;
|
230 |
+
int dim = p->dim;
|
231 |
+
int hidden_dim = p->hidden_dim;
|
232 |
+
int head_size = dim / p->n_heads;
|
233 |
+
|
234 |
+
// copy the token embedding into x
|
235 |
+
float* content_row = &(w->token_embedding_table[token * dim]);
|
236 |
+
memcpy(x, content_row, dim*sizeof(*x));
|
237 |
+
|
238 |
+
// pluck out the "pos" row of freq_cis_real and freq_cis_imag
|
239 |
+
float* freq_cis_real_row = w->freq_cis_real + pos * head_size / 2;
|
240 |
+
float* freq_cis_imag_row = w->freq_cis_imag + pos * head_size / 2;
|
241 |
+
|
242 |
+
// forward all the layers
|
243 |
+
for(int l = 0; l < p->n_layers; l++) {
|
244 |
+
|
245 |
+
// attention rmsnorm
|
246 |
+
rmsnorm(s->xb, x, w->rms_att_weight + l*dim, dim);
|
247 |
+
|
248 |
+
// qkv matmuls for this position
|
249 |
+
matmul(s->q, s->xb, w->wq + l*dim*dim, dim, dim);
|
250 |
+
matmul(s->k, s->xb, w->wk + l*dim*dim, dim, dim);
|
251 |
+
matmul(s->v, s->xb, w->wv + l*dim*dim, dim, dim);
|
252 |
+
|
253 |
+
// apply RoPE rotation to the q and k vectors for each head
|
254 |
+
for (int h = 0; h < p->n_heads; h++) {
|
255 |
+
// get the q and k vectors for this head
|
256 |
+
float* q = s->q + h * head_size;
|
257 |
+
float* k = s->k + h * head_size;
|
258 |
+
// rotate q and k by the freq_cis_real and freq_cis_imag
|
259 |
+
for (int i = 0; i < head_size; i+=2) {
|
260 |
+
float q0 = q[i];
|
261 |
+
float q1 = q[i+1];
|
262 |
+
float k0 = k[i];
|
263 |
+
float k1 = k[i+1];
|
264 |
+
float fcr = freq_cis_real_row[i/2];
|
265 |
+
float fci = freq_cis_imag_row[i/2];
|
266 |
+
q[i] = q0 * fcr - q1 * fci;
|
267 |
+
q[i+1] = q0 * fci + q1 * fcr;
|
268 |
+
k[i] = k0 * fcr - k1 * fci;
|
269 |
+
k[i+1] = k0 * fci + k1 * fcr;
|
270 |
+
}
|
271 |
+
}
|
272 |
+
|
273 |
+
// save key,value at this time step (pos) to our kv cache
|
274 |
+
int loff = l * p->seq_len * dim; // kv cache layer offset for convenience
|
275 |
+
float* key_cache_row = s->key_cache + loff + pos * dim;
|
276 |
+
float* value_cache_row = s->value_cache + loff + pos * dim;
|
277 |
+
memcpy(key_cache_row, s->k, dim*sizeof(*key_cache_row));
|
278 |
+
memcpy(value_cache_row, s->v, dim*sizeof(*value_cache_row));
|
279 |
+
|
280 |
+
// multihead attention. iterate over all heads
|
281 |
+
for (int h = 0; h < p->n_heads; h++) {
|
282 |
+
// get the query vector for this head
|
283 |
+
float* q = s->q + h * head_size;
|
284 |
+
// iterate over all timesteps, including the current one
|
285 |
+
for (int t = 0; t <= pos; t++) {
|
286 |
+
// get the key vector for this head and at this timestep
|
287 |
+
float* k = s->key_cache + loff + t * dim + h * head_size;
|
288 |
+
// calculate the attention score as the dot product of q and k
|
289 |
+
float score = 0.0f;
|
290 |
+
for (int i = 0; i < head_size; i++) {
|
291 |
+
score += q[i] * k[i];
|
292 |
+
}
|
293 |
+
score /= sqrtf(head_size);
|
294 |
+
// save the score to the attention buffer
|
295 |
+
s->att[t] = score;
|
296 |
+
}
|
297 |
+
|
298 |
+
// softmax the scores to get attention weights, from 0..pos inclusively
|
299 |
+
softmax(s->att, pos + 1);
|
300 |
+
|
301 |
+
// weighted sum of the values, store back into xb
|
302 |
+
for (int i = 0; i < head_size; i++) {
|
303 |
+
float val = 0.0f;
|
304 |
+
for (int t = 0; t <= pos; t++) {
|
305 |
+
val += s->att[t] * s->value_cache[loff + t * dim + h * head_size + i]; // note bad locality
|
306 |
+
}
|
307 |
+
s->xb[h * head_size + i] = val;
|
308 |
+
}
|
309 |
+
}
|
310 |
+
|
311 |
+
// final matmul to get the output of the attention
|
312 |
+
matmul(s->xb2, s->xb, w->wo + l*dim*dim, dim, dim);
|
313 |
+
|
314 |
+
// residual connection back into x
|
315 |
+
accum(x, s->xb2, dim);
|
316 |
+
|
317 |
+
// ffn rmsnorm
|
318 |
+
rmsnorm(s->xb, x, w->rms_ffn_weight + l*dim, dim);
|
319 |
+
|
320 |
+
// Now for FFN in PyTorch we have: self.w2(F.silu(self.w1(x)) * self.w3(x))
|
321 |
+
// first calculate self.w1(x) and self.w3(x)
|
322 |
+
matmul(s->hb, s->xb, w->w1 + l*dim*hidden_dim, dim, hidden_dim);
|
323 |
+
matmul(s->hb2, s->xb, w->w3 + l*dim*hidden_dim, dim, hidden_dim);
|
324 |
+
|
325 |
+
// F.silu; silu(x)=x*σ(x),where σ(x) is the logistic sigmoid
|
326 |
+
for (int i = 0; i < hidden_dim; i++) {
|
327 |
+
s->hb[i] = s->hb[i] * (1.0f / (1.0f + expf(-s->hb[i])));
|
328 |
+
}
|
329 |
+
|
330 |
+
// elementwise multiply with w3(x)
|
331 |
+
for (int i = 0; i < hidden_dim; i++) {
|
332 |
+
s->hb[i] = s->hb[i] * s->hb2[i];
|
333 |
+
}
|
334 |
+
|
335 |
+
// final matmul to get the output of the ffn
|
336 |
+
matmul(s->xb, s->hb, w->w2 + l*dim*hidden_dim, hidden_dim, dim);
|
337 |
+
|
338 |
+
// residual connection
|
339 |
+
accum(x, s->xb, dim);
|
340 |
+
}
|
341 |
+
|
342 |
+
// final rmsnorm
|
343 |
+
rmsnorm(x, x, w->rms_final_weight, dim);
|
344 |
+
|
345 |
+
// classifier into logits
|
346 |
+
matmul(s->logits, x, w->token_embedding_table, p->dim, p->vocab_size);
|
347 |
+
}
|
348 |
+
|
349 |
+
int sample(float* probabilities, int n) {
|
350 |
+
// sample index from probabilities, they must sum to 1
|
351 |
+
float r = (float)rand() / (float)RAND_MAX;
|
352 |
+
float cdf = 0.0f;
|
353 |
+
for (int i = 0; i < n; i++) {
|
354 |
+
cdf += probabilities[i];
|
355 |
+
if (r < cdf) {
|
356 |
+
return i;
|
357 |
+
}
|
358 |
+
}
|
359 |
+
return n - 1; // in case of rounding errors
|
360 |
+
}
|
361 |
+
|
362 |
+
int argmax(float* v, int n) {
|
363 |
+
// return argmax of v in elements 0..n
|
364 |
+
int max_i = 0;
|
365 |
+
float max_p = v[0];
|
366 |
+
for (int i = 1; i < n; i++) {
|
367 |
+
if (v[i] > max_p) {
|
368 |
+
max_i = i;
|
369 |
+
max_p = v[i];
|
370 |
+
}
|
371 |
+
}
|
372 |
+
return max_i;
|
373 |
+
}
|
374 |
+
|
375 |
+
// ----------------------------------------------------------------------------
|
376 |
+
|
377 |
+
long time_in_ms() {
|
378 |
+
struct timeval time;
|
379 |
+
gettimeofday(&time, NULL);
|
380 |
+
return time.tv_sec * 1000 + time.tv_usec / 1000;
|
381 |
+
}
|
382 |
+
|
383 |
+
int main(int argc, char *argv[]) {
|
384 |
+
|
385 |
+
// poor man's C argparse
|
386 |
+
char *checkpoint = NULL;
|
387 |
+
float temperature = 0.9f;
|
388 |
+
// 'checkpoint' is necessary arg
|
389 |
+
if (argc < 2) {
|
390 |
+
printf("Usage: %s <checkpoint_file> [temperature] [seed]\n", argv[0]);
|
391 |
+
return 1;
|
392 |
+
}
|
393 |
+
checkpoint = argv[1];
|
394 |
+
// temperature is optional
|
395 |
+
if (argc >= 3) {
|
396 |
+
temperature = atof(argv[2]);
|
397 |
+
}
|
398 |
+
// seed is optional
|
399 |
+
if (argc >= 4) {
|
400 |
+
unsigned int seed = atoi(argv[3]);
|
401 |
+
srand(seed);
|
402 |
+
} else {
|
403 |
+
time_t current_time;
|
404 |
+
time(¤t_time);
|
405 |
+
srand((unsigned int)current_time);
|
406 |
+
}
|
407 |
+
|
408 |
+
// read in the model.bin file
|
409 |
+
Config config;
|
410 |
+
TransformerWeights weights;
|
411 |
+
{
|
412 |
+
FILE *file = fopen(checkpoint, "rb");
|
413 |
+
if (!file) {
|
414 |
+
printf("Unable to open the checkpoint file %s!\n", checkpoint);
|
415 |
+
return 1;
|
416 |
+
}
|
417 |
+
// read in the config header
|
418 |
+
if(fread(&config, sizeof(Config), 1, file) != 1) { return 1; }
|
419 |
+
// read in the Transformer weights
|
420 |
+
malloc_weights(&weights, &config);
|
421 |
+
if(checkpoint_init_weights(&weights, &config, file)) { return 1; }
|
422 |
+
fclose(file);
|
423 |
+
}
|
424 |
+
|
425 |
+
// read in the tokenizer.bin file
|
426 |
+
char** vocab = (char**)malloc(config.vocab_size * sizeof(char*));
|
427 |
+
{
|
428 |
+
FILE *file = fopen("tokenizer.bin", "rb");
|
429 |
+
if (!file) {
|
430 |
+
printf("Unable to open the tokenizer file tokenizer.bin! Run "
|
431 |
+
"python tokenizer.py to convert tokenizer.model -> tokenizer.bin\n");
|
432 |
+
return 1;
|
433 |
+
}
|
434 |
+
int len;
|
435 |
+
for (int i = 0; i < config.vocab_size; i++) {
|
436 |
+
if(fread(&len, sizeof(int), 1, file) != 1) { return 1; }
|
437 |
+
vocab[i] = (char *)malloc(len + 1);
|
438 |
+
if(fread(vocab[i], len, 1, file) != 1) { return 1; }
|
439 |
+
vocab[i][len] = '\0'; // add the string terminating token
|
440 |
+
}
|
441 |
+
fclose(file);
|
442 |
+
}
|
443 |
+
|
444 |
+
// create and init the application RunState
|
445 |
+
RunState state;
|
446 |
+
malloc_run_state(&state, &config);
|
447 |
+
|
448 |
+
// the current position we are in
|
449 |
+
long start = time_in_ms();
|
450 |
+
|
451 |
+
int next;
|
452 |
+
int token = 1; // 1 = BOS token in Llama-2 sentencepiece
|
453 |
+
int pos = 0;
|
454 |
+
while (pos < config.seq_len) {
|
455 |
+
|
456 |
+
// forward the transformer to get logits for the next token
|
457 |
+
transformer(token, pos, &config, &state, &weights);
|
458 |
+
|
459 |
+
// sample the next token
|
460 |
+
if(temperature == 0.0f) {
|
461 |
+
// greedy argmax sampling
|
462 |
+
next = argmax(state.logits, config.vocab_size);
|
463 |
+
} else {
|
464 |
+
// apply the temperature to the logits
|
465 |
+
for (int q=0; q<config.vocab_size; q++) { state.logits[q] /= temperature; }
|
466 |
+
// apply softmax to the logits to get the probabilities for next token
|
467 |
+
softmax(state.logits, config.vocab_size);
|
468 |
+
// we now want to sample from this distribution to get the next token
|
469 |
+
next = sample(state.logits, config.vocab_size);
|
470 |
+
}
|
471 |
+
printf("%s", vocab[next]);
|
472 |
+
fflush(stdout);
|
473 |
+
|
474 |
+
// advance forward
|
475 |
+
token = next;
|
476 |
+
pos++;
|
477 |
+
}
|
478 |
+
printf("\n");
|
479 |
+
|
480 |
+
// report our achieved tok/s
|
481 |
+
long end = time_in_ms();
|
482 |
+
printf("achieved tok/s: %f\n", config.seq_len / (double)(end-start)*1000);
|
483 |
+
|
484 |
+
// memory cleanup
|
485 |
+
free_run_state(&state);
|
486 |
+
free_weights(&weights);
|
487 |
+
for (int i = 0; i < config.vocab_size; i++) { free(vocab[i]); }
|
488 |
+
free(vocab);
|
489 |
+
return 0;
|
490 |
+
}
|
run.exe
ADDED
Binary file (81.5 kB). View file
|
|
sample.py
ADDED
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Sample from the trained model with PyTorch
|
3 |
+
"""
|
4 |
+
import os
|
5 |
+
import pickle
|
6 |
+
from contextlib import nullcontext
|
7 |
+
import torch
|
8 |
+
import tiktoken
|
9 |
+
from model import ModelArgs, Transformer
|
10 |
+
from tokenizer import Tokenizer
|
11 |
+
|
12 |
+
# -----------------------------------------------------------------------------
|
13 |
+
out_dir = 'out' # ignored if init_from is not 'resume'
|
14 |
+
start = "" # or "<|endoftext|>" or etc. Can also specify a file, use as: "FILE:prompt.txt"
|
15 |
+
num_samples = 1 # number of samples to draw
|
16 |
+
max_new_tokens = 100 # number of tokens generated in each sample
|
17 |
+
temperature = 1.0 # 1.0 = no change, < 1.0 = less random, > 1.0 = more random, in predictions
|
18 |
+
top_k = 300 # retain only the top_k most likely tokens, clamp others to have 0 probability
|
19 |
+
seed = 1337
|
20 |
+
device = 'cuda' if torch.cuda.is_available() else 'cpu' # examples: 'cpu', 'cuda', 'cuda:0', 'cuda:1', etc.
|
21 |
+
#dtype = 'bfloat16' if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else 'float16' # 'float32' or 'bfloat16' or 'float16'
|
22 |
+
dtype = "float32"
|
23 |
+
compile = False # use PyTorch 2.0 to compile the model to be faster
|
24 |
+
exec(open('configurator.py').read()) # overrides from command line or config file
|
25 |
+
# -----------------------------------------------------------------------------
|
26 |
+
|
27 |
+
torch.manual_seed(seed)
|
28 |
+
torch.cuda.manual_seed(seed)
|
29 |
+
torch.backends.cuda.matmul.allow_tf32 = True # allow tf32 on matmul
|
30 |
+
torch.backends.cudnn.allow_tf32 = True # allow tf32 on cudnn
|
31 |
+
device_type = 'cuda' if 'cuda' in device else 'cpu' # for later use in torch.autocast
|
32 |
+
ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]
|
33 |
+
ctx = nullcontext() if device_type == 'cpu' else torch.amp.autocast(device_type=device_type, dtype=ptdtype)
|
34 |
+
|
35 |
+
# init from a model saved in a specific directory
|
36 |
+
ckpt_path = os.path.join(out_dir, 'ckpt.pt')
|
37 |
+
checkpoint = torch.load(ckpt_path, map_location=device)
|
38 |
+
gptconf = ModelArgs(**checkpoint['model_args'])
|
39 |
+
model = Transformer(gptconf)
|
40 |
+
state_dict = checkpoint['model']
|
41 |
+
unwanted_prefix = '_orig_mod.'
|
42 |
+
for k,v in list(state_dict.items()):
|
43 |
+
if k.startswith(unwanted_prefix):
|
44 |
+
state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k)
|
45 |
+
model.load_state_dict(state_dict, strict=False)
|
46 |
+
|
47 |
+
model.eval()
|
48 |
+
model.to(device)
|
49 |
+
if compile:
|
50 |
+
print("Compiling the model...")
|
51 |
+
model = torch.compile(model) # requires PyTorch 2.0 (optional)
|
52 |
+
|
53 |
+
# load the tokenizer
|
54 |
+
enc = Tokenizer()
|
55 |
+
|
56 |
+
# encode the beginning of the prompt
|
57 |
+
if start.startswith('FILE:'):
|
58 |
+
with open(start[5:], 'r', encoding='utf-8') as f:
|
59 |
+
start = f.read()
|
60 |
+
start_ids = enc.encode(start, bos=True, eos=False)
|
61 |
+
x = (torch.tensor(start_ids, dtype=torch.long, device=device)[None, ...])
|
62 |
+
|
63 |
+
# run generation
|
64 |
+
with torch.no_grad():
|
65 |
+
with ctx:
|
66 |
+
for k in range(num_samples):
|
67 |
+
y = model.generate(x, max_new_tokens, temperature=temperature, top_k=top_k)
|
68 |
+
print(enc.decode(y[0].tolist()))
|
69 |
+
print('---------------')
|
sample_data/README.md
ADDED
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
This directory includes a few sample datasets to get you started.
|
2 |
+
|
3 |
+
* `california_housing_data*.csv` is California housing data from the 1990 US
|
4 |
+
Census; more information is available at:
|
5 |
+
https://developers.google.com/machine-learning/crash-course/california-housing-data-description
|
6 |
+
|
7 |
+
* `mnist_*.csv` is a small sample of the
|
8 |
+
[MNIST database](https://en.wikipedia.org/wiki/MNIST_database), which is
|
9 |
+
described at: http://yann.lecun.com/exdb/mnist/
|
10 |
+
|
11 |
+
* `anscombe.json` contains a copy of
|
12 |
+
[Anscombe's quartet](https://en.wikipedia.org/wiki/Anscombe%27s_quartet); it
|
13 |
+
was originally described in
|
14 |
+
|
15 |
+
Anscombe, F. J. (1973). 'Graphs in Statistical Analysis'. American
|
16 |
+
Statistician. 27 (1): 17-21. JSTOR 2682899.
|
17 |
+
|
18 |
+
and our copy was prepared by the
|
19 |
+
[vega_datasets library](https://github.com/altair-viz/vega_datasets/blob/4f67bdaad10f45e3549984e17e1b3088c731503d/vega_datasets/_data/anscombe.json).
|
sample_data/anscombe.json
ADDED
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[
|
2 |
+
{"Series":"I", "X":10.0, "Y":8.04},
|
3 |
+
{"Series":"I", "X":8.0, "Y":6.95},
|
4 |
+
{"Series":"I", "X":13.0, "Y":7.58},
|
5 |
+
{"Series":"I", "X":9.0, "Y":8.81},
|
6 |
+
{"Series":"I", "X":11.0, "Y":8.33},
|
7 |
+
{"Series":"I", "X":14.0, "Y":9.96},
|
8 |
+
{"Series":"I", "X":6.0, "Y":7.24},
|
9 |
+
{"Series":"I", "X":4.0, "Y":4.26},
|
10 |
+
{"Series":"I", "X":12.0, "Y":10.84},
|
11 |
+
{"Series":"I", "X":7.0, "Y":4.81},
|
12 |
+
{"Series":"I", "X":5.0, "Y":5.68},
|
13 |
+
|
14 |
+
{"Series":"II", "X":10.0, "Y":9.14},
|
15 |
+
{"Series":"II", "X":8.0, "Y":8.14},
|
16 |
+
{"Series":"II", "X":13.0, "Y":8.74},
|
17 |
+
{"Series":"II", "X":9.0, "Y":8.77},
|
18 |
+
{"Series":"II", "X":11.0, "Y":9.26},
|
19 |
+
{"Series":"II", "X":14.0, "Y":8.10},
|
20 |
+
{"Series":"II", "X":6.0, "Y":6.13},
|
21 |
+
{"Series":"II", "X":4.0, "Y":3.10},
|
22 |
+
{"Series":"II", "X":12.0, "Y":9.13},
|
23 |
+
{"Series":"II", "X":7.0, "Y":7.26},
|
24 |
+
{"Series":"II", "X":5.0, "Y":4.74},
|
25 |
+
|
26 |
+
{"Series":"III", "X":10.0, "Y":7.46},
|
27 |
+
{"Series":"III", "X":8.0, "Y":6.77},
|
28 |
+
{"Series":"III", "X":13.0, "Y":12.74},
|
29 |
+
{"Series":"III", "X":9.0, "Y":7.11},
|
30 |
+
{"Series":"III", "X":11.0, "Y":7.81},
|
31 |
+
{"Series":"III", "X":14.0, "Y":8.84},
|
32 |
+
{"Series":"III", "X":6.0, "Y":6.08},
|
33 |
+
{"Series":"III", "X":4.0, "Y":5.39},
|
34 |
+
{"Series":"III", "X":12.0, "Y":8.15},
|
35 |
+
{"Series":"III", "X":7.0, "Y":6.42},
|
36 |
+
{"Series":"III", "X":5.0, "Y":5.73},
|
37 |
+
|
38 |
+
{"Series":"IV", "X":8.0, "Y":6.58},
|
39 |
+
{"Series":"IV", "X":8.0, "Y":5.76},
|
40 |
+
{"Series":"IV", "X":8.0, "Y":7.71},
|
41 |
+
{"Series":"IV", "X":8.0, "Y":8.84},
|
42 |
+
{"Series":"IV", "X":8.0, "Y":8.47},
|
43 |
+
{"Series":"IV", "X":8.0, "Y":7.04},
|
44 |
+
{"Series":"IV", "X":8.0, "Y":5.25},
|
45 |
+
{"Series":"IV", "X":19.0, "Y":12.50},
|
46 |
+
{"Series":"IV", "X":8.0, "Y":5.56},
|
47 |
+
{"Series":"IV", "X":8.0, "Y":7.91},
|
48 |
+
{"Series":"IV", "X":8.0, "Y":6.89}
|
49 |
+
]
|
sample_data/california_housing_test.csv
ADDED
The diff for this file is too large to render.
See raw diff
|
|
sample_data/california_housing_train.csv
ADDED
The diff for this file is too large to render.
See raw diff
|
|
sample_data/mnist_test.csv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:51c292478d94ec3a01461bdfa82eb0885d262eb09e615679b2d69dedb6ad09e7
|
3 |
+
size 18289443
|
sample_data/mnist_train_small.csv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:1ef64781aa03180f4f5ce504314f058f5d0227277df86060473d973cf43b033e
|
3 |
+
size 36523880
|
test_all.py
ADDED
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Run simply with
|
3 |
+
$ pytest
|
4 |
+
"""
|
5 |
+
import os
|
6 |
+
import pytest # pip install pytest
|
7 |
+
import subprocess
|
8 |
+
|
9 |
+
import torch
|
10 |
+
from model import ModelArgs, Transformer
|
11 |
+
|
12 |
+
def test_argmax_inference():
|
13 |
+
"""
|
14 |
+
Only the simplest test for now: run inference with temperature 0
|
15 |
+
(for determinism) in both C and PyTorch, and see that the sampled tokens
|
16 |
+
are the same.
|
17 |
+
"""
|
18 |
+
test_ckpt_dir = "out" # TODO create a dummy test checkpoint for this?
|
19 |
+
|
20 |
+
# run C version
|
21 |
+
model_path = os.path.join(test_ckpt_dir, "model.bin")
|
22 |
+
command = ["./run", model_path, "0.0"]
|
23 |
+
proc = subprocess.Popen(command, stdout=subprocess.PIPE)
|
24 |
+
c_tokens = []
|
25 |
+
for line in proc.stdout:
|
26 |
+
token = int(line.decode('utf-8').strip())
|
27 |
+
c_tokens.append(token)
|
28 |
+
proc.wait()
|
29 |
+
#print(c_tokens)
|
30 |
+
|
31 |
+
# run PyTorch version
|
32 |
+
device = "cuda" if torch.cuda.is_available() else "cpu"
|
33 |
+
ckpt_path = os.path.join(test_ckpt_dir, "ckpt.pt")
|
34 |
+
checkpoint = torch.load(ckpt_path, map_location=device)
|
35 |
+
gptconf = ModelArgs(**checkpoint['model_args'])
|
36 |
+
model = Transformer(gptconf)
|
37 |
+
state_dict = checkpoint['model']
|
38 |
+
unwanted_prefix = '_orig_mod.'
|
39 |
+
for k,v in list(state_dict.items()):
|
40 |
+
if k.startswith(unwanted_prefix):
|
41 |
+
state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k)
|
42 |
+
model.load_state_dict(state_dict, strict=False)
|
43 |
+
model.eval()
|
44 |
+
model.to(device)
|
45 |
+
x = torch.tensor([[1]], dtype=torch.long, device=device) # 1 is BOS
|
46 |
+
with torch.inference_mode():
|
47 |
+
y = model.generate(x, max_new_tokens=gptconf.max_seq_len, temperature=0.0)
|
48 |
+
pt_tokens = y[0].tolist()
|
49 |
+
pt_tokens = pt_tokens[1:] # remove BOS
|
50 |
+
#print(pt_tokens)
|
51 |
+
|
52 |
+
# compare
|
53 |
+
assert c_tokens == pt_tokens
|
tinystories.py
ADDED
@@ -0,0 +1,166 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Download, preprocess and serve the TinyStories dataset as a DataLoader.
|
3 |
+
"""
|
4 |
+
|
5 |
+
import argparse
|
6 |
+
import glob
|
7 |
+
import json
|
8 |
+
import os
|
9 |
+
import random
|
10 |
+
from typing import List
|
11 |
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
12 |
+
|
13 |
+
import numpy as np
|
14 |
+
import requests
|
15 |
+
import torch
|
16 |
+
import torch.distributed as dist
|
17 |
+
from tqdm import tqdm
|
18 |
+
|
19 |
+
from tokenizer import Tokenizer
|
20 |
+
|
21 |
+
DATA_CACHE_DIR = "data"
|
22 |
+
|
23 |
+
def download_file(url: str, fname: str, chunk_size=1024):
|
24 |
+
"""Helper function to download a file from a given url"""
|
25 |
+
resp = requests.get(url, stream=True)
|
26 |
+
total = int(resp.headers.get("content-length", 0))
|
27 |
+
with open(fname, "wb") as file, tqdm(
|
28 |
+
desc=fname,
|
29 |
+
total=total,
|
30 |
+
unit="iB",
|
31 |
+
unit_scale=True,
|
32 |
+
unit_divisor=1024,
|
33 |
+
) as bar:
|
34 |
+
for data in resp.iter_content(chunk_size=chunk_size):
|
35 |
+
size = file.write(data)
|
36 |
+
bar.update(size)
|
37 |
+
|
38 |
+
|
39 |
+
def download():
|
40 |
+
"""Downloads the dataset to disk."""
|
41 |
+
os.makedirs(DATA_CACHE_DIR, exist_ok=True)
|
42 |
+
|
43 |
+
# download the TinyStories dataset, unless it's already downloaded
|
44 |
+
data_url = "https://huggingface.co/datasets/roneneldan/TinyStories/resolve/main/TinyStories_all_data.tar.gz"
|
45 |
+
data_filename = os.path.join(DATA_CACHE_DIR, "TinyStories_all_data.tar.gz")
|
46 |
+
if not os.path.exists(data_filename):
|
47 |
+
print(f"Downloading {data_url} to {data_filename}...")
|
48 |
+
download_file(data_url, data_filename)
|
49 |
+
else:
|
50 |
+
print(f"{data_filename} already exists, skipping download...")
|
51 |
+
|
52 |
+
# unpack the tar.gz file into all the data shards (json files)
|
53 |
+
data_dir = os.path.join(DATA_CACHE_DIR, "TinyStories_all_data")
|
54 |
+
if not os.path.exists(data_dir):
|
55 |
+
os.makedirs(data_dir, exist_ok=True)
|
56 |
+
print(f"Unpacking {data_filename}...")
|
57 |
+
os.system(f"tar -xzf {data_filename} -C {data_dir}")
|
58 |
+
else:
|
59 |
+
print(f"{data_dir} already exists, skipping unpacking...")
|
60 |
+
|
61 |
+
# print a single example just for debugging and such
|
62 |
+
shard_filenames = sorted(glob.glob(os.path.join(data_dir, "*.json")))
|
63 |
+
with open(shard_filenames[0], "r") as f:
|
64 |
+
data = json.load(f)
|
65 |
+
print("Download done.")
|
66 |
+
print(f"Number of shards: {len(shard_filenames)}")
|
67 |
+
print(f"Example story:\n{data[0]}")
|
68 |
+
|
69 |
+
def pretokenize():
    """Tokenize every TinyStories json shard into a matching .bin file.

    Each shard's stories are stripped, encoded with a BOS prefix, and
    concatenated into one flat uint16 token stream written next to the
    source json (same path, .bin extension).
    """
    enc = Tokenizer()

    def process_shard(shard):
        # tokenize one json shard and write its flat token stream to disk
        with open(shard, "r") as f:
            data = json.load(f)
        all_tokens = []
        for example in tqdm(data):
            text = example["story"]
            text = text.strip() # get rid of leading/trailing whitespace
            tokens = enc.encode(text, bos=True, eos=False) # encode the text, use BOS
            all_tokens.extend(tokens)
        # convert to uint16 nparray (llama vocab ids fit in 16 bits)
        all_tokens = np.array(all_tokens, dtype=np.uint16)
        # write to disk
        tokenized_filename = shard.replace(".json", ".bin")
        with open(tokenized_filename, "wb") as f:
            f.write(all_tokens.tobytes())
        print(f"Saved {tokenized_filename}")

    # iterate the shards and tokenize all of them one by one
    data_dir = os.path.join(DATA_CACHE_DIR, "TinyStories_all_data")
    shard_filenames = sorted(glob.glob(os.path.join(data_dir, "*.json")))

    # process all the shards in a threadpool.
    # NOTE: executor.map raises worker exceptions only when its results are
    # iterated; the original never consumed them, so a failure inside
    # process_shard was silently dropped. Drain the iterator so any worker
    # exception propagates here.
    with ThreadPoolExecutor(max_workers=8) as executor:
        list(executor.map(process_shard, shard_filenames))

    print("Done.")
|
98 |
+
|
99 |
+
|
100 |
+
class PretokDataset(torch.utils.data.IterableDataset):
    """Loads pretokenized examples from disk and yields them as PyTorch tensors."""

    def __init__(self, split, max_seq_len):
        super().__init__()
        self.split = split  # "train" uses shards 1..N, anything else uses shard 0
        self.max_seq_len = max_seq_len  # tokens per yielded example

    def __iter__(self):
        """Yield (x, y) int64 tensor pairs of shape (max_seq_len,) forever.

        y is x shifted left by one token (next-token prediction targets).
        The stream is infinite; the consumer decides when to stop.
        """
        # get worker info within a DataLoader
        worker_info = torch.utils.data.get_worker_info()
        worker_id = worker_info.id if worker_info else 0
        # get DDP rank info
        rank = dist.get_rank() if dist.is_initialized() else 0
        # combine the worker_id and worker_rank to create a unique seed for rng,
        # so every DataLoader worker / DDP rank streams a different shuffle
        seed = 42 + worker_id + 1337 * rank
        rng = random.Random(seed)
        print(f"Created a PretokDataset with rng seed {seed}")
        data_dir = os.path.join(DATA_CACHE_DIR, "TinyStories_all_data")
        shard_filenames = sorted(glob.glob(os.path.join(data_dir, "*.bin")))
        # train/test split. let's use only shard 0 for test split, rest train
        shard_filenames = shard_filenames[1:] if self.split == "train" else shard_filenames[:1]
        while True:
            rng.shuffle(shard_filenames)
            for shard in shard_filenames:
                # open the dataset for reading but keep it on disk with memmap
                m = np.memmap(shard, dtype=np.uint16, mode="r")
                num_batches = len(m) // self.max_seq_len
                num_batches -= 1 # drop the last partial batch
                assert num_batches > 0, "this shard is way too small? investigate."
                ixs = list(range(num_batches))
                rng.shuffle(ixs)
                for ix in ixs:
                    start = ix * self.max_seq_len
                    # +1 so x and y (offset by one) both get max_seq_len tokens
                    end = start + self.max_seq_len + 1
                    # calling .astype will copy the data into a new numpy array, now in RAM
                    chunk = torch.from_numpy((m[start:end]).astype(np.int64))
                    x = chunk[:-1]
                    y = chunk[1:]
                    yield x, y
|
140 |
+
|
141 |
+
|
142 |
+
class Task:
    """Namespace for dataset iteration entry points used by train.py."""

    @staticmethod
    def iter_batches(split, batch_size, max_seq_len, device, num_workers=0):
        """Yield (x, y) batches from PretokDataset, moved onto `device`."""
        dataset = PretokDataset(split, max_seq_len)
        loader = torch.utils.data.DataLoader(
            dataset, batch_size=batch_size, pin_memory=True, num_workers=num_workers
        )
        for inputs, targets in loader:
            inputs = inputs.to(device, non_blocking=True)
            targets = targets.to(device, non_blocking=True)
            yield inputs, targets
|
154 |
+
|
155 |
+
|
156 |
+
if __name__ == "__main__":
    # simple CLI: `python tinystories.py download` / `python tinystories.py pretokenize`
    # depending on the stage call the appropriate function.
    # NOTE: the choices used to also advertise "train_tokenizer", which had no
    # entry in the dispatch table and crashed with a KeyError when selected;
    # deriving the choices from the table keeps them from drifting apart.
    fun = {
        "download": download,
        "pretokenize": pretokenize,
    }
    parser = argparse.ArgumentParser()
    parser.add_argument("stage", type=str, choices=list(fun.keys()))
    args = parser.parse_args()
    fun[args.stage]()
|
tokenizer.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:7d606805ba3c66fb99952e063d7227e38e696f34267bfa03d44b590f0490a14b
|
3 |
+
size 304713
|
tokenizer.model
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
|
3 |
+
size 499723
|
tokenizer.py
ADDED
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Taken from llama code and lightly modified
|
2 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
3 |
+
# This software may be used and distributed according to the terms of the Llama 2 Community License Agreement.
|
4 |
+
|
5 |
+
import os
|
6 |
+
from logging import getLogger
|
7 |
+
from typing import List
|
8 |
+
|
9 |
+
from sentencepiece import SentencePieceProcessor
|
10 |
+
|
11 |
+
TOKENIZER_MODEL = "tokenizer.model" # the llama sentencepiece tokenizer model
|
12 |
+
TOKENIZER_BIN = "tokenizer.bin" # binary version of the tokenizer for inference in C
|
13 |
+
|
14 |
+
class Tokenizer:
    """Thin wrapper around the Llama SentencePiece tokenizer.

    Also exports the vocabulary to a simple length-prefixed binary format
    (TOKENIZER_BIN) consumed by the C inference code.
    """

    def __init__(self):
        model_path = TOKENIZER_MODEL
        assert os.path.isfile(model_path), model_path
        self.sp_model = SentencePieceProcessor(model_file=model_path)
        #print(f"Loaded SentencePiece model from {model_path}")

        # BOS / EOS token IDs
        self.n_words: int = self.sp_model.vocab_size()
        self.bos_id: int = self.sp_model.bos_id()
        self.eos_id: int = self.sp_model.eos_id()
        self.pad_id: int = self.sp_model.pad_id()
        #print(f"#words: {self.n_words} - BOS ID: {self.bos_id} - EOS ID: {self.eos_id}")
        assert self.sp_model.vocab_size() == self.sp_model.get_piece_size()

    def encode(self, s: str, bos: bool, eos: bool) -> List[int]:
        """Encode string `s` into token ids, optionally adding BOS/EOS markers."""
        assert type(s) is str
        t = self.sp_model.encode(s)
        if bos:
            t = [self.bos_id] + t
        if eos:
            t = t + [self.eos_id]
        return t

    def decode(self, t: List[int]) -> str:
        """Decode a list of token ids back into a string."""
        return self.sp_model.decode(t)

    def export(self):
        """Write every vocab token to TOKENIZER_BIN as 4-byte LE length + UTF-8 bytes."""
        tokens = []
        for i in range(self.n_words):

            # decode the token and light postprocessing
            t = self.sp_model.id_to_piece(i)
            if i == self.bos_id:
                t = '\n<s>\n'
            elif i == self.eos_id:
                t = '\n</s>\n'
            elif len(t) == 6 and t.startswith('<0x') and t.endswith('>'):
                t = chr(int(t[3:5], 16)) # e.g. make '<0x01>' into '\x01'
            t = t.replace('▁', ' ') # sentencepiece uses this as the whitespace

            tokens.append(t)

        with open(TOKENIZER_BIN, 'wb') as f:
            for token in tokens:
                # renamed from `bytes`, which shadowed the builtin of that name
                token_bytes = token.encode('utf-8')
                f.write((len(token_bytes)).to_bytes(4, 'little')) # write length of bytes
                f.write(token_bytes) # write token bytes
|
62 |
+
|
63 |
+
if __name__ == "__main__":
    # load the SentencePiece model and dump the vocab to tokenizer.bin
    t = Tokenizer()
    t.export()
|
train.py
ADDED
@@ -0,0 +1,331 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
This training script can be run both on a single gpu in debug mode,
|
3 |
+
and also in a larger training run with distributed data parallel (ddp).
|
4 |
+
|
5 |
+
To run on a single GPU small debug run, example:
|
6 |
+
$ python -m train.py --compile=False --eval_iters=10 --batch_size=8
|
7 |
+
|
8 |
+
To run with DDP on 4 gpus on 1 node, example:
|
9 |
+
$ torchrun --standalone --nproc_per_node=4 train.py
|
10 |
+
|
11 |
+
To run with DDP on 4 gpus across 2 nodes, example:
|
12 |
+
- Run on the first (master) node with example IP 123.456.123.456:
|
13 |
+
$ torchrun --nproc_per_node=8 --nnodes=2 --node_rank=0 --master_addr=123.456.123.456 --master_port=1234 train.py
|
14 |
+
- Run on the worker node:
|
15 |
+
$ torchrun --nproc_per_node=8 --nnodes=2 --node_rank=1 --master_addr=123.456.123.456 --master_port=1234 train.py
|
16 |
+
(If your cluster does not have Infiniband interconnect prepend NCCL_IB_DISABLE=1)
|
17 |
+
"""
|
18 |
+
|
19 |
+
import math
|
20 |
+
import os
|
21 |
+
import time
|
22 |
+
from contextlib import nullcontext
|
23 |
+
from datetime import datetime
|
24 |
+
from functools import partial
|
25 |
+
|
26 |
+
import torch
|
27 |
+
from model import Transformer, ModelArgs
|
28 |
+
from torch.distributed import destroy_process_group, init_process_group
|
29 |
+
from torch.nn.parallel import DistributedDataParallel as DDP
|
30 |
+
|
31 |
+
from tinystories import Task
|
32 |
+
|
33 |
+
# -----------------------------------------------------------------------------
# Default config values for training. Every plain global below can be
# overridden from the command line or a config file via configurator.py.
# I/O
out_dir = "out"
eval_interval = 2000  # evaluate (and maybe checkpoint) every N iterations
log_interval = 1
eval_iters = 100  # number of batches used to estimate train/val loss
eval_only = False # if True, script exits right after the first eval
always_save_checkpoint = False # if True, always save a checkpoint after each eval
init_from = "scratch" # 'scratch' or 'resume'
# wandb logging
wandb_log = False # disabled by default
wandb_project = "llamac"
wandb_run_name = "run" + datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
# data
batch_size = 128 # if gradient_accumulation_steps > 1, this is the micro-batch size
max_seq_len = 256
# model
dim = 288
n_layers = 6
n_heads = 6
multiple_of = 32
dropout = 0.0
# adamw optimizer
gradient_accumulation_steps = 4 # used to simulate larger batch sizes
learning_rate = 5e-4 # max learning rate
max_iters = 100000 # total number of training iterations
weight_decay = 1e-1
beta1 = 0.9
beta2 = 0.95
grad_clip = 1.0 # clip gradients at this value, or disable if == 0.0
# learning rate decay settings
decay_lr = True # whether to decay the learning rate
warmup_iters = 1000 # how many steps to warm up for
# system
device = "cuda" # examples: 'cpu', 'cuda', 'cuda:0', 'cuda:1' etc., or try 'mps' on macbooks
dtype = "bfloat16" # float32|bfloat16|float16
compile = True # use PyTorch 2.0 to compile the model to be faster
# -----------------------------------------------------------------------------
# collect the names of all simple-typed globals defined above; these are the
# user-overridable knobs and also what gets logged as the run `config`
config_keys = [
    k
    for k, v in globals().items()
    if not k.startswith("_") and isinstance(v, (int, float, bool, str))
]
exec(open("configurator.py").read()) # overrides from command line or config file
config = {k: globals()[k] for k in config_keys} # will be useful for logging
# -----------------------------------------------------------------------------
|
79 |
+
|
80 |
+
# fixing some hyperparams to sensible defaults
lr_decay_iters = max_iters # should be ~= max_iters per Chinchilla
min_lr = 0.0 # minimum learning rate, should be ~= learning_rate/10 per Chinchilla

# various inits, derived attributes, I/O setup
# torchrun sets RANK in the environment; its presence signals a DDP launch
ddp = int(os.environ.get("RANK", -1)) != -1 # is this a ddp run?
if ddp:
    init_process_group(backend="nccl")
    ddp_rank = int(os.environ["RANK"])
    ddp_local_rank = int(os.environ["LOCAL_RANK"])
    ddp_world_size = int(os.environ["WORLD_SIZE"])
    device = f"cuda:{ddp_local_rank}"
    torch.cuda.set_device(device)
    master_process = ddp_rank == 0 # this process will do logging, checkpointing etc.
    seed_offset = ddp_rank # each process gets a different seed
    # world_size number of processes will be training simultaneously, so we can scale
    # down the desired gradient accumulation iterations per process proportionally
    assert gradient_accumulation_steps % ddp_world_size == 0
    gradient_accumulation_steps //= ddp_world_size
else:
    # if not ddp, we are running on a single gpu, and one process
    master_process = True
    seed_offset = 0
    ddp_world_size = 1
tokens_per_iter = gradient_accumulation_steps * ddp_world_size * batch_size * max_seq_len
if master_process:
    print(f"tokens per iteration will be: {tokens_per_iter:,}")
    print(f"breaks down as: {gradient_accumulation_steps} grad accum steps * {ddp_world_size} processes * {batch_size} batch size * {max_seq_len} max seq len")

if master_process:
    os.makedirs(out_dir, exist_ok=True)
torch.manual_seed(1337 + seed_offset)
torch.backends.cuda.matmul.allow_tf32 = True # allow tf32 on matmul
torch.backends.cudnn.allow_tf32 = True # allow tf32 on cudnn
device_type = "cuda" if "cuda" in device else "cpu" # for later use in torch.autocast
# note: float16 data type will automatically use a GradScaler
ptdtype = {"float32": torch.float32, "bfloat16": torch.bfloat16, "float16": torch.float16}[dtype]
# autocast context for mixed-precision forward passes; no-op on CPU
ctx = (
    nullcontext()
    if device_type == "cpu"
    else torch.amp.autocast(device_type=device_type, dtype=ptdtype)
)

# task-specific setup
# partial application: the training loop below only ever passes the split name
iter_batches = partial(
    Task.iter_batches,
    batch_size=batch_size,
    max_seq_len=max_seq_len,
    device=device,
    num_workers=0,
)
|
131 |
+
|
132 |
+
# init these up here, can override if init_from='resume' (i.e. from a checkpoint)
iter_num = 0
best_val_loss = 1e9

# model init
model_args = dict(
    dim=dim,
    n_layers=n_layers,
    n_heads=n_heads,
    n_kv_heads=n_heads,
    vocab_size=32000,
    multiple_of=multiple_of,
    max_seq_len=max_seq_len,
    #dropout=dropout,
) # start with model_args from command line
if init_from == "scratch":
    # init a new model from scratch
    print("Initializing a new model from scratch")
    gptconf = ModelArgs(**model_args)
    model = Transformer(gptconf)
elif init_from == "resume":
    print(f"Resuming training from {out_dir}")
    # resume training from a checkpoint.
    ckpt_path = os.path.join(out_dir, "ckpt.pt")
    checkpoint = torch.load(ckpt_path, map_location=device)
    checkpoint_model_args = checkpoint["model_args"]
    # force these config attributes to be equal otherwise we can't even resume training
    # the rest of the attributes (e.g. dropout) can stay as desired from command line
    for k in ["dim", "n_layers", "n_heads", "n_kv_heads", "vocab_size", "multiple_of", "max_seq_len"]:
        model_args[k] = checkpoint_model_args[k]
    # create the model
    gptconf = ModelArgs(**model_args)
    model = Transformer(gptconf)
    state_dict = checkpoint["model"]
    # fix the keys of the state dictionary :(
    # honestly no idea how checkpoints sometimes get this prefix, have to debug more
    # (torch.compile wraps the model and prefixes parameter names with "_orig_mod.")
    unwanted_prefix = "_orig_mod."
    for k, v in list(state_dict.items()):
        if k.startswith(unwanted_prefix):
            state_dict[k[len(unwanted_prefix) :]] = state_dict.pop(k)
    model.load_state_dict(state_dict)
    iter_num = checkpoint["iter_num"]
    best_val_loss = checkpoint["best_val_loss"]
model.to(device)

# initialize a GradScaler. If enabled=False scaler is a no-op
scaler = torch.cuda.amp.GradScaler(enabled=(dtype == "float16"))

# optimizer
optimizer = model.configure_optimizers(weight_decay, learning_rate, (beta1, beta2), device_type)
if init_from == "resume":
    optimizer.load_state_dict(checkpoint["optimizer"])
checkpoint = None # free up memory

# compile the model
if compile:
    print("compiling the model... (takes a ~minute)")
    unoptimized_model = model
    model = torch.compile(model) # requires PyTorch 2.0

# wrap model into DDP container
if ddp:
    # Ignore the `freqs_cis` buffer so that DDP does not broadcast it at
    # construction time since NCCL does not support `ComplexFloat`
    prefix = "_orig_mod." if compile else ""
    model._ddp_params_and_buffers_to_ignore = {prefix + "freqs_cis"}
    model = DDP(model, device_ids=[ddp_local_rank])
|
199 |
+
|
200 |
+
# helps estimate an arbitrarily accurate loss over either split using many batches
@torch.no_grad()
def estimate_loss():
    """Average the loss over eval_iters batches for each of train/val.

    Temporarily puts the (global) model into eval mode and restores train
    mode afterwards; returns {"train": mean_loss, "val": mean_loss} as
    scalar CPU tensors.
    """
    out = {}
    model.eval()
    for split in ["train", "val"]:
        batch_iter = iter_batches(split)
        losses = torch.zeros(eval_iters) # keep on CPU
        for k in range(eval_iters):
            X, Y = next(batch_iter)
            with ctx:
                logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out
|
216 |
+
|
217 |
+
# learning rate decay scheduler (cosine with warmup)
def get_lr(it):
    """Return the learning rate for iteration `it`: linear warmup, then
    cosine decay from learning_rate down to min_lr over lr_decay_iters."""
    # linear warmup phase
    if it < warmup_iters:
        return learning_rate * it / warmup_iters
    # past the decay horizon: clamp to the minimum learning rate
    if it > lr_decay_iters:
        return min_lr
    # cosine decay between warmup_iters and lr_decay_iters
    progress = (it - warmup_iters) / (lr_decay_iters - warmup_iters)
    assert 0 <= progress <= 1
    cosine = 0.5 * (1.0 + math.cos(math.pi * progress))  # goes 1 -> 0
    return min_lr + cosine * (learning_rate - min_lr)
|
230 |
+
|
231 |
+
# logging
if wandb_log and master_process:
    import wandb
    wandb.init(project=wandb_project, name=wandb_run_name, config=config)

# training loop: runs until iter_num exceeds max_iters (or eval_only breaks out)
train_batch_iter = iter_batches("train")
X, Y = next(train_batch_iter) # fetch the very first batch
t0 = time.time()
local_iter_num = 0 # number of iterations in the lifetime of this process
raw_model = model.module if ddp else model # unwrap DDP container if needed
running_mfu = -1.0 # exponential moving average of model flops utilization
while True:
    # determine and set the learning rate for this iteration
    lr = get_lr(iter_num) if decay_lr else learning_rate
    for param_group in optimizer.param_groups:
        param_group["lr"] = lr

    # evaluate the loss on train/val sets and write checkpoints
    if iter_num % eval_interval == 0 and master_process:
        losses = estimate_loss()
        print(f"step {iter_num}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
        if wandb_log:
            try:
                wandb.log(
                    {
                        "iter": iter_num,
                        "tokens": iter_num * tokens_per_iter,
                        "loss/train": losses["train"],
                        "loss/val": losses["val"],
                        "lr": lr,
                        "mfu": running_mfu * 100, # convert to percentage
                    }
                )
            except Exception as e:
                # best-effort logging: never kill training over a wandb hiccup
                print(f"logging to wandb failed: {e}")
        if losses["val"] < best_val_loss or always_save_checkpoint:
            # NOTE(review): this also overwrites best_val_loss when the save was
            # triggered only by always_save_checkpoint — matches upstream behavior
            best_val_loss = losses["val"]
            if iter_num > 0:
                checkpoint = {
                    "model": raw_model.state_dict(),
                    "optimizer": optimizer.state_dict(),
                    "model_args": model_args,
                    "iter_num": iter_num,
                    "best_val_loss": best_val_loss,
                    "config": config,
                }
                print(f"saving checkpoint to {out_dir}")
                torch.save(checkpoint, os.path.join(out_dir, "ckpt.pt"))
                raw_model.export(os.path.join(out_dir, "model.bin"))
    if iter_num == 0 and eval_only:
        break

    # forward backward update, with optional gradient accumulation to simulate larger batch size
    # and using the GradScaler if data type is float16
    for micro_step in range(gradient_accumulation_steps):
        if ddp:
            # in DDP training we only need to sync gradients at the last micro step.
            # the official way to do this is with model.no_sync() context manager, but
            # I really dislike that this bloats the code and forces us to repeat code
            # looking at the source of that context manager, it just toggles this variable
            model.require_backward_grad_sync = micro_step == gradient_accumulation_steps - 1
        with ctx:
            logits, loss = model(X, Y)
            loss = loss / gradient_accumulation_steps
        # immediately async prefetch next batch while model is doing the forward pass on the GPU
        X, Y = next(train_batch_iter)
        # backward pass, with gradient scaling if training in fp16
        scaler.scale(loss).backward()
    # clip the gradient
    if grad_clip != 0.0:
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
    # step the optimizer and scaler if training in fp16
    scaler.step(optimizer)
    scaler.update()
    # flush the gradients as soon as we can, no need for this memory anymore
    optimizer.zero_grad(set_to_none=True)

    # timing and logging
    t1 = time.time()
    dt = t1 - t0
    t0 = t1
    if iter_num % log_interval == 0 and master_process:
        # get loss as float, scale up due to the divide above. note: this is a CPU-GPU sync point
        lossf = loss.item() * gradient_accumulation_steps
        if local_iter_num >= 5: # let the training loop settle a bit
            mfu = raw_model.estimate_mfu(batch_size * gradient_accumulation_steps, dt)
            running_mfu = mfu if running_mfu == -1.0 else 0.9 * running_mfu + 0.1 * mfu
        print(
            f"{iter_num} | loss {lossf:.4f} | lr {lr:e} | {dt*1000:.2f}ms | mfu {running_mfu*100:.2f}%"
        )
    iter_num += 1
    local_iter_num += 1

    # termination conditions
    if iter_num > max_iters:
        break

if ddp:
    destroy_process_group()
|